Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.h | 31
-rw-r--r--  fs/9p/v9fs_vfs.h | 11
-rw-r--r--  fs/9p/vfs_dir.c | 4
-rw-r--r--  fs/9p/vfs_inode.c | 150
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 194
-rw-r--r--  fs/9p/vfs_super.c | 45
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/afs/dir.c | 10
-rw-r--r--  fs/afs/rotate.c | 21
-rw-r--r--  fs/afs/validation.c | 16
-rw-r--r--  fs/bcachefs/Makefile | 4
-rw-r--r--  fs/bcachefs/alloc_background.c | 234
-rw-r--r--  fs/bcachefs/alloc_background.h | 1
-rw-r--r--  fs/bcachefs/alloc_foreground.c | 23
-rw-r--r--  fs/bcachefs/backpointers.c | 143
-rw-r--r--  fs/bcachefs/bbpos_types.h | 2
-rw-r--r--  fs/bcachefs/bcachefs.h | 23
-rw-r--r--  fs/bcachefs/bcachefs_format.h | 53
-rw-r--r--  fs/bcachefs/bkey.h | 207
-rw-r--r--  fs/bcachefs/bkey_types.h | 213
-rw-r--r--  fs/bcachefs/btree_cache.c | 37
-rw-r--r--  fs/bcachefs/btree_gc.c | 151
-rw-r--r--  fs/bcachefs/btree_io.c | 34
-rw-r--r--  fs/bcachefs/btree_iter.c | 20
-rw-r--r--  fs/bcachefs/btree_journal_iter.c | 180
-rw-r--r--  fs/bcachefs/btree_journal_iter.h | 14
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 10
-rw-r--r--  fs/bcachefs/btree_locking.c | 3
-rw-r--r--  fs/bcachefs/btree_types.h | 9
-rw-r--r--  fs/bcachefs/btree_update.c | 23
-rw-r--r--  fs/bcachefs/btree_update.h | 3
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 127
-rw-r--r--  fs/bcachefs/btree_update_interior.h | 3
-rw-r--r--  fs/bcachefs/btree_write_buffer.c | 6
-rw-r--r--  fs/bcachefs/buckets.c | 38
-rw-r--r--  fs/bcachefs/chardev.c | 57
-rw-r--r--  fs/bcachefs/checksum.c | 2
-rw-r--r--  fs/bcachefs/compress.c | 14
-rw-r--r--  fs/bcachefs/debug.c | 8
-rw-r--r--  fs/bcachefs/dirent.c | 143
-rw-r--r--  fs/bcachefs/dirent.h | 6
-rw-r--r--  fs/bcachefs/ec.c | 10
-rw-r--r--  fs/bcachefs/errcode.c | 15
-rw-r--r--  fs/bcachefs/errcode.h | 18
-rw-r--r--  fs/bcachefs/error.c | 10
-rw-r--r--  fs/bcachefs/error.h | 6
-rw-r--r--  fs/bcachefs/extents.h | 17
-rw-r--r--  fs/bcachefs/fifo.h | 4
-rw-r--r--  fs/bcachefs/fs-common.c | 74
-rw-r--r--  fs/bcachefs/fs-io-buffered.c | 149
-rw-r--r--  fs/bcachefs/fs-io-pagecache.h | 9
-rw-r--r--  fs/bcachefs/fs.c | 225
-rw-r--r--  fs/bcachefs/fsck.c | 880
-rw-r--r--  fs/bcachefs/fsck.h | 1
-rw-r--r--  fs/bcachefs/inode.c | 55
-rw-r--r--  fs/bcachefs/inode.h | 19
-rw-r--r--  fs/bcachefs/io_read.c | 2
-rw-r--r--  fs/bcachefs/io_write.c | 18
-rw-r--r--  fs/bcachefs/journal.c | 278
-rw-r--r--  fs/bcachefs/journal.h | 7
-rw-r--r--  fs/bcachefs/journal_io.c | 418
-rw-r--r--  fs/bcachefs/journal_io.h | 47
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 29
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c | 69
-rw-r--r--  fs/bcachefs/journal_types.h | 30
-rw-r--r--  fs/bcachefs/logged_ops.c | 4
-rw-r--r--  fs/bcachefs/lru.c | 7
-rw-r--r--  fs/bcachefs/mean_and_variance.c | 28
-rw-r--r--  fs/bcachefs/mean_and_variance.h | 14
-rw-r--r--  fs/bcachefs/mean_and_variance_test.c | 80
-rw-r--r--  fs/bcachefs/migrate.c | 8
-rw-r--r--  fs/bcachefs/movinggc.c | 3
-rw-r--r--  fs/bcachefs/opts.c | 8
-rw-r--r--  fs/bcachefs/opts.h | 10
-rw-r--r--  fs/bcachefs/rebalance.c | 4
-rw-r--r--  fs/bcachefs/recovery.c | 94
-rw-r--r--  fs/bcachefs/recovery_types.h | 4
-rw-r--r--  fs/bcachefs/sb-clean.c | 16
-rw-r--r--  fs/bcachefs/sb-downgrade.c | 10
-rw-r--r--  fs/bcachefs/sb-errors_types.h | 19
-rw-r--r--  fs/bcachefs/snapshot.c | 32
-rw-r--r--  fs/bcachefs/str_hash.h | 15
-rw-r--r--  fs/bcachefs/subvolume.c | 187
-rw-r--r--  fs/bcachefs/subvolume.h | 8
-rw-r--r--  fs/bcachefs/subvolume_format.h | 4
-rw-r--r--  fs/bcachefs/super-io.c | 30
-rw-r--r--  fs/bcachefs/super.c | 108
-rw-r--r--  fs/bcachefs/sysfs.c | 4
-rw-r--r--  fs/bcachefs/thread_with_file.c | 391
-rw-r--r--  fs/bcachefs/thread_with_file.h | 59
-rw-r--r--  fs/bcachefs/thread_with_file_types.h | 15
-rw-r--r--  fs/bcachefs/time_stats.c | 165
-rw-r--r--  fs/bcachefs/time_stats.h | 159
-rw-r--r--  fs/bcachefs/trace.h | 19
-rw-r--r--  fs/bcachefs/util.c | 227
-rw-r--r--  fs/bcachefs/util.h | 145
-rw-r--r--  fs/bcachefs/xattr.c | 5
-rw-r--r--  fs/btrfs/volumes.c | 58
-rw-r--r--  fs/debugfs/inode.c | 25
-rw-r--r--  fs/dlm/dlm_internal.h | 2
-rw-r--r--  fs/dlm/lock.c | 108
-rw-r--r--  fs/dlm/user.c | 10
-rw-r--r--  fs/exfat/dir.c | 290
-rw-r--r--  fs/exfat/exfat_fs.h | 25
-rw-r--r--  fs/exfat/inode.c | 2
-rw-r--r--  fs/exfat/namei.c | 352
-rw-r--r--  fs/ext2/Kconfig | 15
-rw-r--r--  fs/ext2/balloc.c | 2
-rw-r--r--  fs/ext2/ext2.h | 2
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext2/super.c | 2
-rw-r--r--  fs/ext2/xattr.c | 2
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/extents.c | 6
-rw-r--r--  fs/ext4/inode.c | 10
-rw-r--r--  fs/ext4/mballoc-test.c | 601
-rw-r--r--  fs/ext4/mballoc.c | 62
-rw-r--r--  fs/ext4/resize.c | 5
-rw-r--r--  fs/ext4/super.c | 44
-rw-r--r--  fs/ext4/xattr.c | 61
-rw-r--r--  fs/f2fs/checkpoint.c | 74
-rw-r--r--  fs/f2fs/compress.c | 55
-rw-r--r--  fs/f2fs/data.c | 191
-rw-r--r--  fs/f2fs/debug.c | 11
-rw-r--r--  fs/f2fs/dir.c | 10
-rw-r--r--  fs/f2fs/extent_cache.c | 5
-rw-r--r--  fs/f2fs/f2fs.h | 243
-rw-r--r--  fs/f2fs/file.c | 171
-rw-r--r--  fs/f2fs/gc.c | 129
-rw-r--r--  fs/f2fs/gc.h | 4
-rw-r--r--  fs/f2fs/namei.c | 36
-rw-r--r--  fs/f2fs/node.c | 26
-rw-r--r--  fs/f2fs/node.h | 4
-rw-r--r--  fs/f2fs/recovery.c | 56
-rw-r--r--  fs/f2fs/segment.c | 444
-rw-r--r--  fs/f2fs/segment.h | 90
-rw-r--r--  fs/f2fs/super.c | 210
-rw-r--r--  fs/f2fs/sysfs.c | 52
-rw-r--r--  fs/f2fs/verity.c | 16
-rw-r--r--  fs/fat/nfs.c | 6
-rw-r--r--  fs/fuse/Kconfig | 11
-rw-r--r--  fs/fuse/Makefile | 2
-rw-r--r--  fs/fuse/control.c | 6
-rw-r--r--  fs/fuse/dev.c | 156
-rw-r--r--  fs/fuse/dir.c | 55
-rw-r--r--  fs/fuse/file.c | 457
-rw-r--r--  fs/fuse/fuse_i.h | 153
-rw-r--r--  fs/fuse/inode.c | 55
-rw-r--r--  fs/fuse/iomode.c | 254
-rw-r--r--  fs/fuse/passthrough.c | 355
-rw-r--r--  fs/fuse/readdir.c | 4
-rw-r--r--  fs/fuse/virtio_fs.c | 156
-rw-r--r--  fs/inode.c | 7
-rw-r--r--  fs/isofs/inode.c | 18
-rw-r--r--  fs/jfs/jfs_incore.h | 2
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/kernfs/dir.c | 31
-rw-r--r--  fs/kernfs/file.c | 8
-rw-r--r--  fs/kernfs/kernfs-internal.h | 2
-rw-r--r--  fs/netfs/fscache_io.c | 4
-rw-r--r--  fs/nfs/client.c | 9
-rw-r--r--  fs/nfs/delegation.c | 4
-rw-r--r--  fs/nfs/direct.c | 18
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 2
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 2
-rw-r--r--  fs/nfs/fs_context.c | 1
-rw-r--r--  fs/nfs/fscache.c | 9
-rw-r--r--  fs/nfs/inode.c | 8
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/netns.h | 2
-rw-r--r--  fs/nfs/nfs3client.c | 1
-rw-r--r--  fs/nfs/nfs42.h | 7
-rw-r--r--  fs/nfs/nfs4_fs.h | 1
-rw-r--r--  fs/nfs/nfs4client.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 29
-rw-r--r--  fs/nfs/nfs4state.c | 12
-rw-r--r--  fs/nfs/nfs4super.c | 24
-rw-r--r--  fs/nfs/nfs4trace.c | 2
-rw-r--r--  fs/nfs/nfs4trace.h | 58
-rw-r--r--  fs/nfs/nfsroot.c | 4
-rw-r--r--  fs/nfs/pnfs.c | 8
-rw-r--r--  fs/nfs/pnfs_nfs.c | 44
-rw-r--r--  fs/nfs/read.c | 2
-rw-r--r--  fs/nfs/super.c | 10
-rw-r--r--  fs/nfs/write.c | 6
-rw-r--r--  fs/nfsd/trace.h | 10
-rw-r--r--  fs/nilfs2/alloc.c | 91
-rw-r--r--  fs/nilfs2/bmap.c | 3
-rw-r--r--  fs/nilfs2/btree.c | 9
-rw-r--r--  fs/nilfs2/cpfile.c | 321
-rw-r--r--  fs/nilfs2/cpfile.h | 10
-rw-r--r--  fs/nilfs2/dat.c | 40
-rw-r--r--  fs/nilfs2/direct.c | 9
-rw-r--r--  fs/nilfs2/ifile.c | 21
-rw-r--r--  fs/nilfs2/ifile.h | 10
-rw-r--r--  fs/nilfs2/inode.c | 46
-rw-r--r--  fs/nilfs2/ioctl.c | 4
-rw-r--r--  fs/nilfs2/mdt.c | 4
-rw-r--r--  fs/nilfs2/nilfs.h | 3
-rw-r--r--  fs/nilfs2/page.c | 8
-rw-r--r--  fs/nilfs2/recovery.c | 4
-rw-r--r--  fs/nilfs2/segbuf.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 121
-rw-r--r--  fs/nilfs2/sufile.c | 88
-rw-r--r--  fs/nilfs2/super.c | 33
-rw-r--r--  fs/nilfs2/the_nilfs.c | 2
-rw-r--r--  fs/notify/fanotify/fanotify.c | 6
-rw-r--r--  fs/notify/fsnotify.c | 28
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/file.c | 1
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 12
-rw-r--r--  fs/ocfs2/quota_local.c | 3
-rw-r--r--  fs/ocfs2/super.c | 6
-rw-r--r--  fs/orangefs/orangefs-cache.c | 2
-rw-r--r--  fs/orangefs/orangefs-kernel.h | 10
-rw-r--r--  fs/orangefs/super.c | 4
-rw-r--r--  fs/overlayfs/copy_up.c | 6
-rw-r--r--  fs/proc/Kconfig | 2
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 17
-rw-r--r--  fs/quota/dquot.c | 172
-rw-r--r--  fs/quota/quota_tree.c | 152
-rw-r--r--  fs/quota/quota_v1.c | 6
-rw-r--r--  fs/quota/quota_v2.c | 35
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/super.c | 2
-rw-r--r--  fs/smb/server/glob.h | 2
-rw-r--r--  fs/smb/server/ksmbd_netlink.h | 1
-rw-r--r--  fs/smb/server/mgmt/user_session.c | 28
-rw-r--r--  fs/smb/server/mgmt/user_session.h | 3
-rw-r--r--  fs/smb/server/oplock.c | 96
-rw-r--r--  fs/smb/server/oplock.h | 7
-rw-r--r--  fs/smb/server/server.c | 1
-rw-r--r--  fs/smb/server/smb2misc.c | 26
-rw-r--r--  fs/smb/server/smb2ops.c | 6
-rw-r--r--  fs/smb/server/smb2pdu.c | 509
-rw-r--r--  fs/smb/server/smb2pdu.h | 15
-rw-r--r--  fs/smb/server/smb_common.c | 11
-rw-r--r--  fs/smb/server/vfs.c | 12
-rw-r--r--  fs/smb/server/vfs_cache.c | 137
-rw-r--r--  fs/smb/server/vfs_cache.h | 9
-rw-r--r--  fs/super.c | 18
-rw-r--r--  fs/sysfs/group.c | 55
-rw-r--r--  fs/tracefs/event_inode.c | 70
-rw-r--r--  fs/tracefs/internal.h | 2
-rw-r--r--  fs/ubifs/debug.c | 9
-rw-r--r--  fs/ubifs/dir.c | 2
-rw-r--r--  fs/ubifs/file.c | 443
-rw-r--r--  fs/ubifs/find.c | 32
-rw-r--r--  fs/ubifs/journal.c | 171
-rw-r--r--  fs/ubifs/lprops.c | 6
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/ubifs/tnc.c | 9
-rw-r--r--  fs/ubifs/tnc_misc.c | 22
-rw-r--r--  fs/ubifs/ubifs.h | 5
-rw-r--r--  fs/udf/dir.c | 2
-rw-r--r--  fs/udf/inode.c | 2
-rw-r--r--  fs/udf/namei.c | 21
-rw-r--r--  fs/udf/super.c | 555
-rw-r--r--  fs/udf/udf_sb.h | 1
-rw-r--r--  fs/userfaultfd.c | 86
-rw-r--r--  fs/xfs/Kconfig | 13
-rw-r--r--  fs/xfs/Makefile | 15
-rw-r--r--  fs/xfs/kmem.c | 30
-rw-r--r--  fs/xfs/kmem.h | 83
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 79
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 18
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 258
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 191
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 22
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 37
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 365
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 19
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 152
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 1078
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 274
-rw-r--r--  fs/xfs/libxfs/xfs_btree_mem.c | 347
-rw-r--r--  fs/xfs/libxfs/xfs_btree_mem.h | 75
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.c | 133
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 59
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 25
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 59
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 13
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 7
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 21
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_health.h | 95
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 232
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 155
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c | 26
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 49
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 69
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 78
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 284
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 31
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 231
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 11
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 67
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 155
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.h | 26
-rw-r--r--  fs/xfs/libxfs/xfs_trans_inode.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 26
-rw-r--r--  fs/xfs/mrlock.h | 78
-rw-r--r--  fs/xfs/scrub/agb_bitmap.h | 5
-rw-r--r--  fs/xfs/scrub/agheader.c | 12
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 47
-rw-r--r--  fs/xfs/scrub/alloc_repair.c | 27
-rw-r--r--  fs/xfs/scrub/bitmap.c | 14
-rw-r--r--  fs/xfs/scrub/bitmap.h | 2
-rw-r--r--  fs/xfs/scrub/bmap.c | 2
-rw-r--r--  fs/xfs/scrub/bmap_repair.c | 8
-rw-r--r--  fs/xfs/scrub/btree.c | 58
-rw-r--r--  fs/xfs/scrub/common.c | 129
-rw-r--r--  fs/xfs/scrub/common.h | 13
-rw-r--r--  fs/xfs/scrub/cow_repair.c | 2
-rw-r--r--  fs/xfs/scrub/dir.c | 4
-rw-r--r--  fs/xfs/scrub/fscounters.c | 29
-rw-r--r--  fs/xfs/scrub/fscounters.h | 20
-rw-r--r--  fs/xfs/scrub/fscounters_repair.c | 72
-rw-r--r--  fs/xfs/scrub/health.c | 140
-rw-r--r--  fs/xfs/scrub/health.h | 5
-rw-r--r--  fs/xfs/scrub/ialloc.c | 20
-rw-r--r--  fs/xfs/scrub/ialloc_repair.c | 10
-rw-r--r--  fs/xfs/scrub/inode_repair.c | 237
-rw-r--r--  fs/xfs/scrub/iscan.c | 767
-rw-r--r--  fs/xfs/scrub/iscan.h | 84
-rw-r--r--  fs/xfs/scrub/newbt.c | 14
-rw-r--r--  fs/xfs/scrub/newbt.h | 7
-rw-r--r--  fs/xfs/scrub/nlinks.c | 930
-rw-r--r--  fs/xfs/scrub/nlinks.h | 102
-rw-r--r--  fs/xfs/scrub/nlinks_repair.c | 223
-rw-r--r--  fs/xfs/scrub/quotacheck.c | 867
-rw-r--r--  fs/xfs/scrub/quotacheck.h | 76
-rw-r--r--  fs/xfs/scrub/quotacheck_repair.c | 261
-rw-r--r--  fs/xfs/scrub/rcbag.c | 307
-rw-r--r--  fs/xfs/scrub/rcbag.h | 28
-rw-r--r--  fs/xfs/scrub/rcbag_btree.c | 370
-rw-r--r--  fs/xfs/scrub/rcbag_btree.h | 81
-rw-r--r--  fs/xfs/scrub/readdir.c | 4
-rw-r--r--  fs/xfs/scrub/reap.c | 2
-rw-r--r--  fs/xfs/scrub/refcount.c | 12
-rw-r--r--  fs/xfs/scrub/refcount_repair.c | 177
-rw-r--r--  fs/xfs/scrub/repair.c | 120
-rw-r--r--  fs/xfs/scrub/repair.h | 23
-rw-r--r--  fs/xfs/scrub/rmap.c | 26
-rw-r--r--  fs/xfs/scrub/rmap_repair.c | 1697
-rw-r--r--  fs/xfs/scrub/rtsummary.c | 6
-rw-r--r--  fs/xfs/scrub/scrub.c | 37
-rw-r--r--  fs/xfs/scrub/scrub.h | 18
-rw-r--r--  fs/xfs/scrub/stats.c | 6
-rw-r--r--  fs/xfs/scrub/symlink.c | 3
-rw-r--r--  fs/xfs/scrub/trace.c | 8
-rw-r--r--  fs/xfs/scrub/trace.h | 637
-rw-r--r--  fs/xfs/scrub/xfarray.c | 234
-rw-r--r--  fs/xfs/scrub/xfarray.h | 30
-rw-r--r--  fs/xfs/scrub/xfile.c | 345
-rw-r--r--  fs/xfs/scrub/xfile.h | 62
-rw-r--r--  fs/xfs/xfs_acl.c | 4
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 4
-rw-r--r--  fs/xfs/xfs_attr_item.c | 25
-rw-r--r--  fs/xfs/xfs_attr_list.c | 26
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 119
-rw-r--r--  fs/xfs/xfs_bmap_item.h | 4
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 20
-rw-r--r--  fs/xfs/xfs_buf.c | 320
-rw-r--r--  fs/xfs/xfs_buf.h | 21
-rw-r--r--  fs/xfs/xfs_buf_item.c | 8
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c | 8
-rw-r--r--  fs/xfs/xfs_buf_mem.c | 270
-rw-r--r--  fs/xfs/xfs_buf_mem.h | 34
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 8
-rw-r--r--  fs/xfs/xfs_discard.c | 19
-rw-r--r--  fs/xfs/xfs_dquot.c | 36
-rw-r--r--  fs/xfs/xfs_error.c | 8
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 5
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 8
-rw-r--r--  fs/xfs/xfs_file.c | 4
-rw-r--r--  fs/xfs/xfs_filestream.c | 6
-rw-r--r--  fs/xfs/xfs_fsmap.c | 4
-rw-r--r--  fs/xfs/xfs_health.c | 202
-rw-r--r--  fs/xfs/xfs_hooks.c | 52
-rw-r--r--  fs/xfs/xfs_hooks.h | 65
-rw-r--r--  fs/xfs/xfs_icache.c | 14
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 274
-rw-r--r--  fs/xfs/xfs_inode.h | 37
-rw-r--r--  fs/xfs/xfs_inode_item.c | 6
-rw-r--r--  fs/xfs/xfs_inode_item_recover.c | 5
-rw-r--r--  fs/xfs/xfs_ioctl.c | 8
-rw-r--r--  fs/xfs/xfs_iomap.c | 19
-rw-r--r--  fs/xfs/xfs_iops.c | 9
-rw-r--r--  fs/xfs/xfs_itable.c | 12
-rw-r--r--  fs/xfs/xfs_iwalk.c | 41
-rw-r--r--  fs/xfs/xfs_linux.h | 17
-rw-r--r--  fs/xfs/xfs_log.c | 34
-rw-r--r--  fs/xfs/xfs_log_cil.c | 31
-rw-r--r--  fs/xfs/xfs_log_recover.c | 102
-rw-r--r--  fs/xfs/xfs_mount.c | 2
-rw-r--r--  fs/xfs/xfs_mount.h | 12
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 17
-rw-r--r--  fs/xfs/xfs_qm.c | 59
-rw-r--r--  fs/xfs/xfs_qm.h | 16
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 46
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 12
-rw-r--r--  fs/xfs/xfs_reflink.c | 16
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 11
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 18
-rw-r--r--  fs/xfs/xfs_stats.c | 4
-rw-r--r--  fs/xfs/xfs_stats.h | 2
-rw-r--r--  fs/xfs/xfs_super.c | 20
-rw-r--r--  fs/xfs/xfs_symlink.c | 158
-rw-r--r--  fs/xfs/xfs_symlink.h | 1
-rw-r--r--  fs/xfs/xfs_sysfs.c | 4
-rw-r--r--  fs/xfs/xfs_trace.c | 3
-rw-r--r--  fs/xfs/xfs_trace.h | 607
-rw-r--r--  fs/xfs/xfs_trans.c | 2
-rw-r--r--  fs/xfs/xfs_trans.h | 1
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 7
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 42
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 171
439 files changed, 23137 insertions, 9280 deletions
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 698c43dd5dc8..9defa12208f9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -179,16 +179,13 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
-extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
- struct p9_fid *fid,
- struct super_block *sb, int new);
+extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern const struct netfs_request_ops v9fs_req_ops;
-extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
- struct p9_fid *fid,
- struct super_block *sb, int new);
+extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
+ struct p9_fid *fid);
/* other default globals */
#define V9FS_PORT 564
@@ -230,27 +227,9 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
struct super_block *sb)
{
if (v9fs_proto_dotl(v9ses))
- return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
+ return v9fs_fid_iget_dotl(sb, fid);
else
- return v9fs_inode_from_fid(v9ses, fid, sb, 0);
-}
-
-/**
- * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb)
-{
- if (v9fs_proto_dotl(v9ses))
- return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
- else
- return v9fs_inode_from_fid(v9ses, fid, sb, 1);
+ return v9fs_fid_iget(sb, fid);
}
#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 0e8418066a48..7923c3c347cb 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,13 +40,16 @@ extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
- dev_t rdev);
void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t rdev);
+ struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
-ino_t v9fs_qid2ino(struct p9_qid *qid);
+#if (BITS_PER_LONG == 32)
+#define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32)))
+#else
+#define QID2INO(q) ((ino_t) ((q)->path+2))
+#endif
+
void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb, unsigned int flags);
void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
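
[Editor's note: the QID2INO() macro added above replaces the v9fs_qid2ino() helper removed later in this diff. It offsets the 64-bit qid path by 2 (presumably keeping clear of the reserved inode numbers 0 and 1) and, on 32-bit builds, folds the value down by XOR-ing in the high word of the path. A minimal userspace sketch of the same mapping, for illustration only; the qid2ino_32/qid2ino_64 names are made up and not kernel API:]

#include <stdint.h>
#include <stdio.h>

/* 32-bit build: truncate (path + 2), mixing in the high word of path,
 * exactly as QID2INO() does when BITS_PER_LONG == 32 */
static uint32_t qid2ino_32(uint64_t path)
{
	return (uint32_t)((path + 2) ^ (path >> 32));
}

/* 64-bit build: the offset path is used directly */
static uint64_t qid2ino_64(uint64_t path)
{
	return path + 2;
}

int main(void)
{
	uint64_t paths[] = { 0x0, 0xdeadbeefULL, 0x123456789abcdef0ULL };

	for (unsigned i = 0; i < sizeof(paths) / sizeof(paths[0]); i++)
		printf("path=%#018llx -> ino32=%#010x ino64=%#llx\n",
		       (unsigned long long)paths[i],
		       qid2ino_32(paths[i]),
		       (unsigned long long)qid2ino_64(paths[i]));
	return 0;
}
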
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 4102759a5cb5..e0d34e4e9076 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -127,7 +127,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
}
over = !dir_emit(ctx, st.name, strlen(st.name),
- v9fs_qid2ino(&st.qid), dt_type(&st));
+ QID2INO(&st.qid), dt_type(&st));
p9stat_free(&st);
if (over)
return 0;
@@ -184,7 +184,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, curdirent.d_name,
strlen(curdirent.d_name),
- v9fs_qid2ino(&curdirent.qid),
+ QID2INO(&curdirent.qid),
curdirent.d_type))
return 0;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 32572982f72e..360a5304ec03 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -253,9 +253,12 @@ void v9fs_set_netfs_context(struct inode *inode)
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t rdev)
+ struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev)
{
int err = 0;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+
+ memcpy(&v9inode->qid, qid, sizeof(struct p9_qid));
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
@@ -332,36 +335,6 @@ error:
}
/**
- * v9fs_get_inode - helper function to setup an inode
- * @sb: superblock
- * @mode: mode to setup inode with
- * @rdev: The device numbers to set
- */
-
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
-{
- int err;
- struct inode *inode;
- struct v9fs_session_info *v9ses = sb->s_fs_info;
-
- p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
-
- inode = new_inode(sb);
- if (!inode) {
- pr_warn("%s (%d): Problem allocating inode\n",
- __func__, task_pid_nr(current));
- return ERR_PTR(-ENOMEM);
- }
- err = v9fs_init_inode(v9ses, inode, mode, rdev);
- if (err) {
- iput(inode);
- return ERR_PTR(err);
- }
- v9fs_set_netfs_context(inode);
- return inode;
-}
-
-/**
* v9fs_evict_inode - Remove an inode from the inode cache
* @inode: inode to release
*
@@ -384,82 +357,40 @@ void v9fs_evict_inode(struct inode *inode)
#endif
}
-static int v9fs_test_inode(struct inode *inode, void *data)
-{
- int umode;
- dev_t rdev;
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_wstat *st = (struct p9_wstat *)data;
- struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-
- umode = p9mode2unixmode(v9ses, st, &rdev);
- /* don't match inode of different type */
- if (inode_wrong_type(inode, umode))
- return 0;
-
- /* compare qid details */
- if (memcmp(&v9inode->qid.version,
- &st->qid.version, sizeof(v9inode->qid.version)))
- return 0;
-
- if (v9inode->qid.type != st->qid.type)
- return 0;
-
- if (v9inode->qid.path != st->qid.path)
- return 0;
- return 1;
-}
-
-static int v9fs_test_new_inode(struct inode *inode, void *data)
-{
- return 0;
-}
-
-static int v9fs_set_inode(struct inode *inode, void *data)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_wstat *st = (struct p9_wstat *)data;
-
- memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
- return 0;
-}
-
-static struct inode *v9fs_qid_iget(struct super_block *sb,
- struct p9_qid *qid,
- struct p9_wstat *st,
- int new)
+struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid)
{
dev_t rdev;
int retval;
umode_t umode;
- unsigned long i_ino;
struct inode *inode;
+ struct p9_wstat *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *inode, void *data);
- if (new)
- test = v9fs_test_new_inode;
- else
- test = v9fs_test_inode;
-
- i_ino = v9fs_qid2ino(qid);
- inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st);
- if (!inode)
+ inode = iget_locked(sb, QID2INO(&fid->qid));
+ if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
+
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- inode->i_ino = i_ino;
+ st = p9_client_stat(fid);
+ if (IS_ERR(st)) {
+ retval = PTR_ERR(st);
+ goto error;
+ }
+
umode = p9mode2unixmode(v9ses, st, &rdev);
- retval = v9fs_init_inode(v9ses, inode, umode, rdev);
+ retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev);
+ v9fs_stat2inode(st, inode, sb, 0);
+ p9stat_free(st);
+ kfree(st);
if (retval)
goto error;
- v9fs_stat2inode(st, inode, sb, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
@@ -470,23 +401,6 @@ error:
}
-struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb, int new)
-{
- struct p9_wstat *st;
- struct inode *inode = NULL;
-
- st = p9_client_stat(fid);
- if (IS_ERR(st))
- return ERR_CAST(st);
-
- inode = v9fs_qid_iget(sb, &st->qid, st, new);
- p9stat_free(st);
- kfree(st);
- return inode;
-}
-
/**
* v9fs_at_to_dotl_flags- convert Linux specific AT flags to
* plan 9 AT flag.
@@ -633,7 +547,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
/*
* instantiate inode and assign the unopened fid to the dentry
*/
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
@@ -761,10 +675,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
inode = NULL;
else if (IS_ERR(fid))
inode = ERR_CAST(fid);
- else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
@@ -1187,26 +1099,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
}
/**
- * v9fs_qid2ino - convert qid into inode number
- * @qid: qid to hash
- *
- * BUG: potential for inode number collisions?
- */
-
-ino_t v9fs_qid2ino(struct p9_qid *qid)
-{
- u64 path = qid->path + 2;
- ino_t i = 0;
-
- if (sizeof(ino_t) == sizeof(path))
- memcpy(&i, &path, sizeof(ino_t));
- else
- i = (ino_t) (path ^ (path >> 32));
-
- return i;
-}
-
-/**
* v9fs_vfs_get_link - follow a symlink path
* @dentry: dentry for symlink
* @inode: inode for symlink
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 3505227e1704..ef9db3e03506 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,78 +52,33 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
-static int v9fs_test_inode_dotl(struct inode *inode, void *data)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
-
- /* don't match inode of different type */
- if (inode_wrong_type(inode, st->st_mode))
- return 0;
-
- if (inode->i_generation != st->st_gen)
- return 0;
-
- /* compare qid details */
- if (memcmp(&v9inode->qid.version,
- &st->qid.version, sizeof(v9inode->qid.version)))
- return 0;
-
- if (v9inode->qid.type != st->qid.type)
- return 0;
-
- if (v9inode->qid.path != st->qid.path)
- return 0;
- return 1;
-}
-
-/* Always get a new inode */
-static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
-{
- return 0;
-}
-
-static int v9fs_set_inode_dotl(struct inode *inode, void *data)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
-
- memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
- inode->i_generation = st->st_gen;
- return 0;
-}
-
-static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
- struct p9_qid *qid,
- struct p9_fid *fid,
- struct p9_stat_dotl *st,
- int new)
+struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid)
{
int retval;
- unsigned long i_ino;
struct inode *inode;
+ struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *inode, void *data);
-
- if (new)
- test = v9fs_test_new_inode_dotl;
- else
- test = v9fs_test_inode_dotl;
- i_ino = v9fs_qid2ino(qid);
- inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st);
- if (!inode)
+ inode = iget_locked(sb, QID2INO(&fid->qid));
+ if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
+
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- inode->i_ino = i_ino;
- retval = v9fs_init_inode(v9ses, inode,
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+ if (IS_ERR(st)) {
+ retval = PTR_ERR(st);
+ goto error;
+ }
+
+ retval = v9fs_init_inode(v9ses, inode, &fid->qid,
st->st_mode, new_decode_dev(st->st_rdev));
+ kfree(st);
if (retval)
goto error;
@@ -135,6 +90,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
unlock_new_inode(inode);
+
return inode;
error:
iget_failed(inode);
@@ -142,22 +98,6 @@ error:
}
-struct inode *
-v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb, int new)
-{
- struct p9_stat_dotl *st;
- struct inode *inode = NULL;
-
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
- if (IS_ERR(st))
- return ERR_CAST(st);
-
- inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
- kfree(st);
- return inode;
-}
-
struct dotl_openflag_map {
int open_flag;
int dotl_flag;
@@ -307,7 +247,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
goto out;
}
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -402,32 +342,17 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
}
/* instantiate inode and assign the unopened fid to the dentry */
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
- goto error;
- }
- v9fs_fid_add(dentry, &fid);
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- d_instantiate(dentry, inode);
- err = 0;
- } else {
- /*
- * Not in cached mode. No need to populate
- * inode with stat. We need to get an inode
- * so that we can set the acl with dentry
- */
- inode = v9fs_get_inode(dir->i_sb, mode, 0);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto error;
- }
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- d_instantiate(dentry, inode);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
}
+ v9fs_fid_add(dentry, &fid);
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+ d_instantiate(dentry, inode);
+ err = 0;
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
@@ -709,14 +634,11 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
kgid_t gid;
const unsigned char *name;
struct p9_qid qid;
- struct inode *inode;
struct p9_fid *dfid;
struct p9_fid *fid = NULL;
- struct v9fs_session_info *v9ses;
name = dentry->d_name.name;
p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
- v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
@@ -736,36 +658,6 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
}
v9fs_invalidate_inode_attr(dir);
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- /* Now walk from the parent so we can get an unopened fid. */
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
- goto error;
- }
-
- /* instantiate inode and assign the unopened fid to dentry */
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
- goto error;
- }
- v9fs_fid_add(dentry, &fid);
- d_instantiate(dentry, inode);
- err = 0;
- } else {
- /* Not in cached mode. No need to populate inode with stat */
- inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto error;
- }
- d_instantiate(dentry, inode);
- }
error:
p9_fid_put(fid);
@@ -888,33 +780,17 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
err);
goto error;
}
-
- /* instantiate inode and assign the unopened fid to the dentry */
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
- goto error;
- }
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, &fid);
- d_instantiate(dentry, inode);
- err = 0;
- } else {
- /*
- * Not in cached mode. No need to populate inode with stat.
- * socket syscall returns a fd, so we need instantiate
- */
- inode = v9fs_get_inode(dir->i_sb, mode, rdev);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto error;
- }
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- d_instantiate(dentry, inode);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, &fid);
+ d_instantiate(dentry, inode);
+ err = 0;
error:
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 941f7d0e0bfa..4236058c7bbd 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -110,7 +110,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- umode_t mode = 0777 | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
@@ -140,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
else
sb->s_d_op = &v9fs_dentry_operations;
- inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
goto release_sb;
@@ -152,32 +151,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
sb->s_root = root;
- if (v9fs_proto_dotl(v9ses)) {
- struct p9_stat_dotl *st = NULL;
-
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto release_sb;
- }
- d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, d_inode(root), 0);
- kfree(st);
- } else {
- struct p9_wstat *st = NULL;
-
- st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto release_sb;
- }
-
- d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, d_inode(root), sb, 0);
-
- p9stat_free(st);
- kfree(st);
- }
retval = v9fs_get_acl(inode, fid);
if (retval)
goto release_sb;
@@ -271,21 +244,6 @@ done:
return res;
}
-static int v9fs_drop_inode(struct inode *inode)
-{
- struct v9fs_session_info *v9ses;
-
- v9ses = v9fs_inode2v9ses(inode);
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
- /*
- * in case of non cached mode always drop the
- * inode because we want the inode attribute
- * to always match that on the server.
- */
- return 1;
-}
-
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
@@ -320,7 +278,6 @@ static const struct super_operations v9fs_super_ops_dotl = {
.alloc_inode = v9fs_alloc_inode,
.free_inode = v9fs_free_inode,
.statfs = v9fs_statfs,
- .drop_inode = v9fs_drop_inode,
.evict_inode = v9fs_evict_inode,
.show_options = v9fs_show_options,
.umount_begin = v9fs_umount_begin,
diff --git a/fs/Kconfig b/fs/Kconfig
index ea2f77446080..a46b0cbc4d8f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -60,7 +60,6 @@ endif # BLOCK
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
- depends on !(ARM || MIPS || SPARC)
depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
@@ -261,6 +260,7 @@ menuconfig HUGETLBFS
depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
+ select PADATA if SMP
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 8a67fc427e74..67afe68972d5 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -474,16 +474,6 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
continue;
}
- /* Don't expose silly rename entries to userspace. */
- if (nlen > 6 &&
- dire->u.name[0] == '.' &&
- ctx->actor != afs_lookup_filldir &&
- ctx->actor != afs_lookup_one_filldir &&
- memcmp(dire->u.name, ".__afs", 6) == 0) {
- ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
- continue;
- }
-
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 700a27bc8c25..ed04bd1eeae8 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -602,6 +602,8 @@ iterate_address:
goto wait_for_more_probe_results;
alist = op->estate->addresses;
+ best_prio = -1;
+ addr_index = 0;
for (i = 0; i < alist->nr_addrs; i++) {
if (alist->addrs[i].prio > best_prio) {
addr_index = i;
@@ -609,9 +611,7 @@ iterate_address:
}
}
- addr_index = READ_ONCE(alist->preferred);
- if (!test_bit(addr_index, &set))
- addr_index = __ffs(set);
+ alist->preferred = addr_index;
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
@@ -656,12 +656,6 @@ wait_for_more_probe_results:
next_server:
trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
_debug("next");
- ASSERT(op->estate);
- alist = op->estate->addresses;
- if (op->call_responded &&
- op->addr_index != READ_ONCE(alist->preferred) &&
- test_bit(alist->preferred, &op->addr_tried))
- WRITE_ONCE(alist->preferred, op->addr_index);
op->estate = NULL;
goto pick_server;
@@ -690,14 +684,7 @@ no_more_servers:
failed:
trace_afs_rotate(op, afs_rotate_trace_failed, 0);
op->flags |= AFS_OPERATION_STOP;
- if (op->estate) {
- alist = op->estate->addresses;
- if (op->call_responded &&
- op->addr_index != READ_ONCE(alist->preferred) &&
- test_bit(alist->preferred, &op->addr_tried))
- WRITE_ONCE(alist->preferred, op->addr_index);
- op->estate = NULL;
- }
+ op->estate = NULL;
_leave(" = f [failed %d]", afs_op_error(op));
return false;
}
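
[Editor's note: the iterate_address hunk above switches from trusting the recorded alist->preferred slot to rescanning the probe results for the highest-priority address, and it initializes best_prio and addr_index before the scan; without that the loop would compare against whatever happened to be in those variables. A self-contained sketch of the selection loop, with simplified types and a hypothetical pick_best() name:]

#include <stdio.h>

struct addr { int prio; };

/* Return the index of the highest-priority address. best_prio starts
 * at -1 (below any valid priority) and best at 0, mirroring the
 * initialization the hunk adds before the scan. */
static int pick_best(const struct addr *addrs, int nr)
{
	int best_prio = -1, best = 0;

	for (int i = 0; i < nr; i++)
		if (addrs[i].prio > best_prio) {
			best = i;
			best_prio = addrs[i].prio;
		}
	return best;
}

int main(void)
{
	struct addr alist[] = { { 3 }, { 7 }, { 5 } };

	printf("preferred address index: %d\n", pick_best(alist, 3));
	return 0;
}
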
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 46b37f2cce7d..32a53fc8dfb2 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -122,6 +122,9 @@ bool afs_check_validity(const struct afs_vnode *vnode)
const struct afs_volume *volume = vnode->volume;
time64_t deadline = ktime_get_real_seconds() + 10;
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ return true;
+
if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
atomic64_read(&vnode->cb_expires_at) <= deadline ||
volume->cb_expires_at <= deadline ||
@@ -389,12 +392,17 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
key_serial(key));
if (afs_check_validity(vnode))
- return 0;
+ return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
ret = down_write_killable(&vnode->validate_lock);
if (ret < 0)
goto error;
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ ret = -ESTALE;
+ goto error_unlock;
+ }
+
/* Validate a volume after the v_break has changed or the volume
* callback expired. We only want to do this once per volume per
* v_break change. The actual work will be done when parsing the
@@ -448,12 +456,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->cb_ro_snapshot = cb_ro_snapshot;
vnode->cb_scrub = cb_scrub;
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _debug("file already deleted");
- ret = -ESTALE;
- goto error_unlock;
- }
-
/* if the vnode's data version number changed then its contents are
* different */
zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7cc..b02796c8a595 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -82,6 +82,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
+ time_stats.o \
thread_with_file.o \
trace.o \
two_state_shared_lock.o \
@@ -90,3 +91,6 @@ bcachefs-y := \
xattr.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+
+# Silence "note: xyz changed in GCC X.X" messages
+subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fd3e175d8342..893e38f9db80 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -29,6 +29,8 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -530,13 +532,13 @@ int bch2_bucket_gens_init(struct bch_fs *c)
u8 gen = bch2_alloc_to_v4(k, &a)->gen;
unsigned offset;
struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+ int ret2 = 0;
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
- if (ret)
- break;
+ ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret2)
+ goto iter_err;
have_bucket_gens_key = false;
}
@@ -547,7 +549,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- 0;
+iter_err:
+ ret2;
}));
if (have_bucket_gens_key && !ret)
@@ -850,7 +853,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_journal_seq);
if (ret) {
bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
+ "setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
return ret;
}
}
@@ -860,23 +863,28 @@ int bch2_trigger_alloc(struct btree_trans *trans,
*bucket_gen(ca, new.k->p.offset) = new_a->gen;
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+ percpu_up_read(&c->mark_lock);
+
+#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
+#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
+#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ if (statechange(a->data_type == BCH_DATA_free) &&
+ bucket_flushed(new_a))
closure_wake_up(&c->freelist_wait);
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
+ if (statechange(a->data_type == BCH_DATA_need_discard) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+ bucket_flushed(new_a))
+ bch2_discard_one_bucket_fast(c, new.k->p);
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
+ if (statechange(a->data_type == BCH_DATA_cached) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
bch2_do_invalidates(c);
- if (new_a->data_type == BCH_DATA_need_gc_gens)
+ if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_do_gc_gens(c);
- percpu_up_read(&c->mark_lock);
}
if ((flags & BTREE_TRIGGER_GC) &&
@@ -1045,14 +1053,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != discard_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, need_discard_key_wrong,
- "incorrect key in need_discard btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[discard_key_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(k.k->type != discard_key_type,
+ c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1076,15 +1083,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != freespace_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_key_wrong,
- "incorrect key in freespace btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[freespace_key_type],
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(k.k->type != freespace_key_type,
+ c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1108,14 +1114,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (a->gen != alloc_gen(k, gens_offset) &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
- " %s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+ c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_bucket_gens *g =
bch2_trans_kmalloc(trans, sizeof(*g));
@@ -1167,14 +1172,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
- if (k.k->type != KEY_TYPE_set &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- " device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset))) {
+ if (fsck_err_on(k.k->type != KEY_TYPE_set,
+ c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset)) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1604,6 +1608,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
+static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+{
+ int ret;
+
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i)
+ if (bkey_eq(*i, bucket)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = darray_push(&c->discard_buckets_in_flight, bucket);
+out:
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+ return ret;
+}
+
+static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+{
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i)
+ if (bkey_eq(*i, bucket)) {
+ darray_remove_item(&c->discard_buckets_in_flight, i);
+ goto found;
+ }
+ BUG();
+found:
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+}
+
struct discard_buckets_state {
u64 seen;
u64 open;
@@ -1642,6 +1676,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
+ bool discard_locked = false;
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
@@ -1709,6 +1744,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
+ if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ goto out;
+
+ discard_locked = true;
+
if (!bkey_eq(*discard_pos_done, iter.pos) &&
ca->mi.discard && !c->opts.nochanges) {
/*
@@ -1740,6 +1780,8 @@ write:
count_event(c, bucket_discard);
s->discarded++;
out:
+ if (discard_locked)
+ discard_in_flight_remove(c, iter.pos);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
@@ -1779,6 +1821,93 @@ void bch2_do_discards(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
+static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
+{
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void bch2_do_discards_fast_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+
+ while (1) {
+ bool got_bucket = false;
+ struct bpos bucket;
+ struct bch_dev *ca;
+
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i) {
+ if (i->snapshot)
+ continue;
+
+ ca = bch_dev_bkey_exists(c, i->inode);
+
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ darray_remove_item(&c->discard_buckets_in_flight, i);
+ continue;
+ }
+
+ got_bucket = true;
+ bucket = *i;
+ i->snapshot = true;
+ break;
+ }
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+
+ if (!got_bucket)
+ break;
+
+ if (ca->mi.discard && !c->opts.nochanges)
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+
+ int ret = bch2_trans_do(c, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, bucket));
+ bch_err_fn(c, ret);
+
+ percpu_ref_put(&ca->io_ref);
+ discard_in_flight_remove(c, bucket);
+
+ if (ret)
+ break;
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+ if (!percpu_ref_is_dying(&ca->io_ref) &&
+ !discard_in_flight_add(c, bucket) &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
+ !queue_work(c->write_ref_wq, &c->discard_fast_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
static int invalidate_one_bucket(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
@@ -2210,9 +2339,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
+void bch2_fs_allocator_background_exit(struct bch_fs *c)
+{
+ darray_exit(&c->discard_buckets_in_flight);
+}
+
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
+ mutex_init(&c->discard_buckets_in_flight_lock);
INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
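
[Editor's note: discard_in_flight_add()/discard_in_flight_remove() above form a small mutex-protected set over a darray, so that each bucket is handled by exactly one discard path (the new fast path or the main worker) at a time; add() failing with -EEXIST is what arbitrates ownership. A rough userspace sketch of the same pattern, with made-up names and a fixed-size array standing in for the darray:]

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct bucket { unsigned long long dev, offset; };

static struct bucket in_flight[64];
static size_t nr_in_flight;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns -EEXIST if the bucket is already being discarded, so only
 * one caller at a time owns a given bucket. */
static int in_flight_add(struct bucket b)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	for (size_t i = 0; i < nr_in_flight; i++)
		if (in_flight[i].dev == b.dev && in_flight[i].offset == b.offset) {
			ret = -EEXIST;
			goto out;
		}
	assert(nr_in_flight < sizeof(in_flight) / sizeof(in_flight[0]));
	in_flight[nr_in_flight++] = b;
out:
	pthread_mutex_unlock(&lock);
	return ret;
}

/* Removing an entry that was never added is a caller bug, matching the
 * kernel's BUG() in discard_in_flight_remove(). */
static void in_flight_remove(struct bucket b)
{
	pthread_mutex_lock(&lock);
	for (size_t i = 0; i < nr_in_flight; i++)
		if (in_flight[i].dev == b.dev && in_flight[i].offset == b.offset) {
			in_flight[i] = in_flight[--nr_in_flight];
			pthread_mutex_unlock(&lock);
			return;
		}
	assert(0);
}

int main(void)
{
	struct bucket b = { 1, 1024 };

	printf("first add:  %d\n", in_flight_add(b));	/* 0 */
	printf("second add: %d\n", in_flight_add(b));	/* -EEXIST */
	in_flight_remove(b);
	return 0;
}
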
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index e7f7e842ee1b..052b2fac25d6 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_fs_allocator_background_exit(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 633d3223b353..214b15c84d1f 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, false);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
spin_unlock(&c->freelist_lock);
return ob;
@@ -555,8 +551,7 @@ again:
goto again;
}
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -1361,15 +1356,17 @@ retry:
/* Don't retry from all devices if we're out of open buckets: */
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, watermark,
flags, cl);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ if (!ret2 ||
+ bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+ ret = ret2;
goto alloc_done;
+ }
}
/*
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 569b97904da4..8cb35ea572cb 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -131,8 +131,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
printbuf_exit(&buf);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- bch2_inconsistent_error(c);
- return -EIO;
+ return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
} else {
return 0;
}
@@ -478,8 +477,7 @@ missing:
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+ if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
@@ -555,60 +553,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
};
}
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
- u64 mem_bytes;
-
si_meminfo(&i);
- mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
+
+ u64 mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- unsigned btree_leaf_mask,
- unsigned btree_interior_mask,
+ u64 btree_leaf_mask,
+ u64 btree_interior_mask,
struct bbpos start, struct bbpos *end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- enum btree_id btree;
+ struct bch_fs *c = trans->c;
+ s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
- for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
- unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+ btree_interior_mask |= btree_leaf_mask;
+
+ c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_start = start;
+ c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
+
+ for (enum btree_id btree = start.btree;
+ btree < BTREE_ID_NR && !ret;
+ btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
+ struct btree_iter iter;
+ struct btree *b;
if (!((1U << btree) & btree_leaf_mask) &&
!((1U << btree) & btree_interior_mask))
continue;
- bch2_trans_node_iter_init(trans, &iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, 0);
- /*
- * for_each_btree_key_contineu() doesn't check the return value
- * from bch2_btree_iter_advance(), which is needed when
- * iterating over interior nodes where we'll see keys at
- * SPOS_MAX:
- */
- do {
- k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
- ret = bkey_err(k);
- if (!k.k || ret)
- break;
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = BBPOS(btree, k.k->p);
+ __for_each_btree_node(trans, iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ mem_may_pin -= btree_buf_bytes(b);
+ if (mem_may_pin <= 0) {
+ c->btree_cache.pinned_nodes_end = *end =
+ BBPOS(btree, b->key.k.p);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
- } while (bch2_btree_iter_advance(&iter));
+ }
bch2_trans_iter_exit(trans, &iter);
}
- *end = BBPOS_MAX;
return ret;
}
@@ -666,62 +665,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
return 0;
}
-static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
- struct bpos bucket)
-{
- return bch2_dev_exists2(c, bucket.inode)
- ? bucket_pos_to_bp(c, bucket, 0)
- : bucket;
-}
-
-static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
-{
- struct btree_iter alloc_iter;
- struct btree_iter bp_iter;
- struct bkey_s_c alloc_k, bp_k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- bool alloc_end = false, bp_end = false;
- int ret = 0;
-
- bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
- start, 0, 1, 0);
- bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
- while (1) {
- alloc_k = !alloc_end
- ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
- : bkey_s_c_null;
- bp_k = !bp_end
- ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
- : bkey_s_c_null;
-
- ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
- if ((!alloc_k.k && !bp_k.k) || ret) {
- *end = SPOS_MAX;
- break;
- }
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
- break;
- }
-
- if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
- bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
- if (!bch2_btree_iter_advance(&alloc_iter))
- alloc_end = true;
- } else {
- if (!bch2_btree_iter_advance(&bp_iter))
- bp_end = true;
- }
- }
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
-}
-
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
@@ -732,10 +675,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bkey_init(&s.last_flushed.k->k);
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
+ struct bbpos end;
+ ret = bch2_get_btree_in_memory_pos(trans,
+ BIT_ULL(BTREE_ID_backpointers),
+ BIT_ULL(BTREE_ID_backpointers),
+ BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
if (ret)
break;
+ s.bucket_end = end.pos;
+
if ( bpos_eq(s.bucket_start, POS_MIN) &&
!bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
@@ -763,6 +712,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
+ c->btree_cache.pinned_nodes_leaf_mask = 0;
+ c->btree_cache.pinned_nodes_interior_mask = 0;
+
bch_err_fn(c, ret);
return ret;
}
@@ -868,6 +820,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
+ c->btree_cache.pinned_nodes_leaf_mask = 0;
+ c->btree_cache.pinned_nodes_interior_mask = 0;
+
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
index 5198e94cf3b8..f63893344f80 100644
--- a/fs/bcachefs/bbpos_types.h
+++ b/fs/bcachefs/bbpos_types.h
@@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 69d0d60d50e3..799aa32b6b4d 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -212,6 +212,7 @@
#include "recovery_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
+#include "time_stats.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -266,6 +267,9 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
__printf(2, 3)
+void bch2_print_opts(struct bch_opts *, const char *, ...);
+
+__printf(2, 3)
void __bch2_print(struct bch_fs *c, const char *fmt, ...);
#define maybe_dev_to_fs(_c) _Generic((_c), \
@@ -504,6 +508,7 @@ enum gc_phase {
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
+ GC_PHASE_BTREE_subvolume_children,
GC_PHASE_PENDING_DELETE,
};
@@ -593,7 +598,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct bch2_time_stats io_latency[2];
+ struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@@ -663,6 +668,8 @@ struct journal_seq_blacklist_table {
};
struct journal_keys {
+ /* must match layout in darray_types.h */
+ size_t nr, size;
struct journal_key {
u64 journal_seq;
u32 journal_offset;
@@ -671,15 +678,13 @@ struct journal_keys {
bool allocated;
bool overwritten;
struct bkey_i *k;
- } *d;
+ } *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
- size_t nr;
- size_t size;
atomic_t ref;
bool initial_ref_held;
};
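
The gap-buffer comment above is the key to why journal key insertion stays
cheap during replay: keeping the empty space at the insertion point rather
than at the end makes each in-order insert O(1) after an O(distance) gap
move. A self-contained sketch of the idea (simplified; not the bcachefs
journal_keys code, which stores struct journal_key and tracks the gap the
same way):

	#include <stddef.h>
	#include <string.h>

	struct gap_buf {
		int	*data;
		size_t	nr, size, gap;	/* live: [0, gap) and [gap + size - nr, size) */
	};

	static void move_gap(struct gap_buf *b, size_t new_gap)
	{
		size_t empty = b->size - b->nr;

		if (new_gap < b->gap)		/* slide entries right, over the gap */
			memmove(b->data + new_gap + empty, b->data + new_gap,
				(b->gap - new_gap) * sizeof(*b->data));
		else if (new_gap > b->gap)	/* slide entries left, into the gap */
			memmove(b->data + b->gap, b->data + b->gap + empty,
				(new_gap - b->gap) * sizeof(*b->data));
		b->gap = new_gap;
	}

	static void insert_at(struct gap_buf *b, size_t idx, int v)
	{
		move_gap(b, idx);	/* O(distance from current gap) */
		b->data[b->gap++] = v;	/* O(1); sequential inserts stay here */
		b->nr++;
	}
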
@@ -703,6 +708,7 @@ struct btree_trans_buf {
x(reflink) \
x(fallocate) \
x(discard) \
+ x(discard_fast) \
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
@@ -843,6 +849,8 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
+ struct workqueue_struct *btree_node_rewrite_worker;
+
struct list_head pending_node_rewrites;
struct mutex pending_node_rewrites_lock;
@@ -919,8 +927,6 @@ struct bch_fs {
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
- u64 blocked_allocate;
- u64 blocked_allocate_open_bucket;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@@ -940,8 +946,11 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct discard_work;
struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct bpos) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0668b682a21c..bff8750ac0d7 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -189,7 +189,11 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
struct bkey {
/* Size of combined key and value, in u64s */
@@ -222,7 +226,36 @@ struct bkey {
__u8 pad[1];
#endif
-} __packed __aligned(8);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+/*
+ * The big-endian version of bkey can't be compiled by rustc with the "aligned"
+ * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
+ * So for Rust compatibility, don't include this. It can be included in the LE
+ * version because the "packed" attr is redundant in that case.
+ *
+ * History: (quoting Kent)
+ *
+ * Specifically, when I was designing bkey, I wanted the header to be no
+ * bigger than necessary so that bkey_packed could use the rest. That means
+ * that, decently often, extent keys will fit into only 8 bytes instead of
+ * spilling over to 16.
+ *
+ * But packed_bkey treats the part after the header - the packed section -
+ * as a single multi word, variable length integer. And bkey, the unpacked
+ * version, is just a special case version of a bkey_packed; all the packed
+ * bkey code will work on keys in any packed format, the in-memory
+ * representation of an unpacked key also is just one type of packed key...
+ *
+ * So that constrains the key part of a big-endian bkey to start right
+ * after the header.
+ *
+ * If we ever do a bkey_v2 and need to expand the header by another byte for
+ * some reason - that will clean up this wart.
+ */
+__aligned(8)
+#endif
+;
struct bkey_packed {
__u64 _data[0];
@@ -840,7 +873,9 @@ struct bch_sb_field_downgrade {
x(snapshot_skiplists, BCH_VERSION(1, 1)) \
x(deleted_inodes, BCH_VERSION(1, 2)) \
x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4))
+ x(member_seq, BCH_VERSION(1, 4)) \
+ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
+ x(btree_subvolume_children, BCH_VERSION(1, 6))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1275,7 +1310,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10) \
- x(write_buffer_keys, 11)
+ x(write_buffer_keys, 11) \
+ x(datetime, 12)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1376,6 +1412,11 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
+struct jset_entry_datetime {
+ struct jset_entry entry;
+ __le64 seconds;
+} __packed __aligned(8);
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1482,7 +1523,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
+ x(subvolume_children, 19, 0, \
+ BIT_ULL(KEY_TYPE_set))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
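
On the packed/aligned comment earlier in this file: the reason the LE build
keeps __aligned(8) is that "packed" alone drops a struct's alignment to 1,
while adding "aligned" restores natural alignment without reintroducing
internal padding. A small userspace demo of the difference (illustrative
only; simplified field layout, not the real struct bkey):

	#include <stdio.h>

	struct k_packed {
		unsigned char u64s;
		unsigned long long p;
	} __attribute__((packed));

	struct k_packed_aligned {
		unsigned char u64s;
		unsigned long long p;
	} __attribute__((packed, aligned(8)));

	int main(void)
	{
		/* packed: size 9, align 1; packed+aligned: size 16, align 8 */
		printf("packed:         size %zu align %zu\n",
		       sizeof(struct k_packed), _Alignof(struct k_packed));
		printf("packed+aligned: size %zu align %zu\n",
		       sizeof(struct k_packed_aligned),
		       _Alignof(struct k_packed_aligned));
		return 0;
	}
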
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 831be01809f2..cf23ff47bed8 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -4,7 +4,7 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
-
+#include "bkey_types.h"
#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
@@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
const struct bkey_packed *);
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
- unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-
- EBUG_ON(k->u64s < ret);
- return ret;
+ return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
new file mode 100644
index 000000000000..c9ae9e42b385
--- /dev/null
+++ b/fs/bcachefs/bkey_types.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i - bkey with inline value
+ * bkey_s - bkey with split value
+ * bkey_s_c - bkey with split value, const
+ */
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
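
The x() macro moved into this new header relies on C's anonymous unions so
that a typed key (e.g. struct bkey_i_extent) can be passed anywhere a plain
bkey_i is expected, with no cast and no runtime cost, while downcasts go
through the generated, type-checked helpers. A self-contained userspace demo
of that upcast/downcast pattern (hypothetical "foo" key type; simplified,
not the kernel definitions):

	#include <assert.h>
	#include <stddef.h>

	struct bkey    { unsigned type; };
	struct bkey_i  { struct bkey k; };
	struct bch_foo { int field; };

	#define KEY_TYPE_foo 7

	struct bkey_i_foo {
		union {
			struct bkey	k;
			struct bkey_i	k_i;	/* upcast: &foo->k_i is a bkey_i * */
		};
		struct bch_foo	v;
	};

	/* Checked downcast, like the generated bkey_i_to_##name() helpers: */
	static struct bkey_i_foo *bkey_i_to_foo(struct bkey_i *k)
	{
		assert(k->k.type == KEY_TYPE_foo);
		return (struct bkey_i_foo *) ((char *) k -
					      offsetof(struct bkey_i_foo, k_i));
	}

	int main(void)
	{
		struct bkey_i_foo f = {
			.k = { .type = KEY_TYPE_foo },
			.v = { .field = 1 },
		};
		struct bkey_i *k = &f.k_i;		/* upcast: no cast needed */
		assert(bkey_i_to_foo(k)->v.field == 1);	/* round trip */
		return 0;
	}
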
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d7c81beac14a..562561a9a510 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+ b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0;
lockdep_assert_held(&bc->lock);
+
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = b->c.level
+ ? bc->pinned_nodes_interior_mask
+ : bc->pinned_nodes_leaf_mask;
+
+ if ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+ kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -711,6 +724,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ if (!path)
+ return b;
+
trans->memory_allocation_failure = true;
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -760,8 +776,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- if (path)
- trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+ BUG_ON(!path);
+
+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
@@ -901,7 +918,7 @@ retry:
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -992,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1075,7 +1092,7 @@ lock_node:
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- b = ERR_PTR(-EIO);
+ b = ERR_PTR(-BCH_ERR_btree_node_read_error);
goto out;
}
@@ -1096,7 +1113,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- BUG_ON(trans && !btree_node_locked(path, level + 1));
+ BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1102995643b1..bdaed29f084a 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -389,7 +389,8 @@ again:
have_child = dropped_children = false;
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -406,7 +407,7 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(ret == -EIO, c,
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
@@ -478,7 +479,8 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -591,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
- if (!g->gen_valid &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@@ -609,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
- if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@@ -631,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
- if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
- if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
@@ -931,7 +929,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
bch2_bkey_buf_init(&prev);
bch2_bkey_buf_init(&cur);
bkey_init(&prev.k->k);
@@ -963,7 +961,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (b->c.level > target_depth) {
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
struct btree *child;
@@ -976,7 +975,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
false);
ret = PTR_ERR_OR_ZERO(child);
- if (ret == -EIO) {
+ if (bch2_err_matches(ret, EIO)) {
bch2_topology_error(c);
if (__fsck_err(c,
@@ -1190,9 +1189,7 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
- kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
+ kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
@@ -1365,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket gc, *b;
+ struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
struct bch_alloc_v4 old_convert, new;
const struct bch_alloc_v4 *old;
- enum bch_data_type type;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
@@ -1377,28 +1373,29 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
b = gc_bucket(ca, iter->pos.offset);
+ old_gc = *b;
+
+ if ((old->data_type == BCH_DATA_sb ||
+ old->data_type == BCH_DATA_journal) &&
+ !bch2_dev_is_online(ca)) {
+ b->data_type = old->data_type;
+ b->dirty_sectors = old->dirty_sectors;
+ }
/*
* b->data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
- type = __alloc_data_type(b->dirty_sectors,
- b->cached_sectors,
- b->stripe,
- *old,
- b->data_type);
- if (b->data_type != type) {
- struct bch_dev_usage *u;
-
- preempt_disable();
- u = this_cpu_ptr(ca->usage_gc);
- u->d[b->data_type].buckets--;
- b->data_type = type;
- u->d[b->data_type].buckets++;
- preempt_enable();
- }
-
+ b->data_type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ *old,
+ b->data_type);
gc = *b;
+
+ if (gc.data_type != old_gc.data_type ||
+ gc.dirty_sectors != old_gc.dirty_sectors)
+ bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
percpu_up_read(&c->mark_lock);
if (metadata_only &&
@@ -1410,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (gen_after(old->gen, gc.gen))
return 0;
- if (c->opts.reconstruct_alloc ||
- fsck_err_on(new.data_type != gc.data_type, c,
+ if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
@@ -1422,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
- if (c->opts.reconstruct_alloc || \
- fsck_err_on(new._f != gc._f, c, _errtype, \
+ if (fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@@ -1491,7 +1486,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
@@ -1585,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
" should be %u",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
-
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@@ -1595,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
new->k.type = KEY_TYPE_deleted;
else
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+ ret = bch2_trans_update(trans, iter, new, 0);
}
fsck_err:
printbuf_exit(&buf);
@@ -1817,10 +1812,10 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
- ret = bch2_gc_stripes_done(c, metadata_only) ?:
- bch2_gc_reflink_done(c, metadata_only) ?:
- bch2_gc_alloc_done(c, metadata_only) ?:
- bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_alloc_done(c, metadata_only) ?:
+ bch2_gc_done(c, initial, metadata_only) ?:
+ bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only);
bch2_journal_unblock(&c->journal);
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index aa9b6cbe3226..34df8ccc5fec 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
- vpfree(p, size);
+ kvfree(p);
}
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -581,8 +581,7 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_bad_node:
bch2_print_string_as_lines(KERN_ERR, out.buf);
- bch2_topology_error(c);
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+ ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
bch2_print_string_as_lines(KERN_ERR, out.buf);
@@ -840,6 +839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
+ if (k->u64s < bkeyp_key_u64s(&b->format, k))
+ return false;
+
struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@@ -881,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
"invalid bkey format %u", k->format))
goto drop_this_key;
- /* XXX: validate k->u64s */
+ if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_u64s,
+ "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
+ goto drop_this_key;
+
if (!write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
@@ -1058,7 +1066,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i", ret))
+ "decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
@@ -1099,7 +1107,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i\n", ret))
+ "decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
sectors = vstruct_sectors(bne, c->block_bits);
@@ -1330,7 +1338,7 @@ start:
if (saw_error && !btree_node_read_error(b)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
- bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+ bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
__func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
bch2_btree_node_rewrite_async(c, b);
@@ -1737,7 +1745,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
- ret = -EIO;
+ ret = -BCH_ERR_btree_node_read_error;
goto err;
}
@@ -1841,7 +1849,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_write_all_failed;
+ ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
@@ -1866,8 +1874,8 @@ out:
return;
err:
set_btree_node_noevict(b);
- if (!bch2_err_matches(ret, EROFS))
- bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
goto out;
}
@@ -2123,7 +2131,7 @@ do_write:
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error encrypting btree node: %i\n", ret))
+ "encrypting btree node: %s", bch2_err_str(ret)))
goto err;
nonce = btree_nonce(i, b->written << 9);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 3ef338df82f5..51bcdc6c6d1c 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret = 0;
- __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
@@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
path = &trans->paths[path_idx];
if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out;
+ goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
@@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
-
+out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@@ -1520,7 +1520,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans)
{
unsigned nr = trans->nr_paths * 2;
- void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
sizeof(struct btree_trans_paths) +
nr * sizeof(struct btree_path) +
nr * sizeof(btree_path_idx_t) + 8 +
@@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- btree_path_set_should_be_locked(trans->paths + iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (btree_path_node(path, path->level))
+ btree_path_set_should_be_locked(path);
return 0;
}
@@ -2305,7 +2307,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
- return bkey_s_c_err(-EIO);
+ return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@@ -2503,6 +2505,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
iter->k = iter2.k;
k.k = &iter->k;
}
@@ -2762,6 +2765,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
struct btree_trans *trans = src->trans;
*dst = *src;
+#ifdef TRACK_PATH_ALLOCATED
+ dst->ip_allocated = _RET_IP_;
+#endif
if (src->path)
__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
@@ -3085,7 +3091,7 @@ void bch2_trans_put(struct btree_trans *trans)
trans->paths = NULL;
if (paths_allocated != trans->_paths_allocated)
- kfree_rcu_mightsleep(paths_allocated);
+ kvfree_rcu_mightsleep(paths_allocated);
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 719a94a84950..50e04356d72c 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "bset.h"
+#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
@@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
- return keys->d + idx_to_pos(keys, idx);
+ return keys->data + idx_to_pos(keys, idx);
}
static size_t __bch2_journal_key_search(struct journal_keys *keys,
@@ -180,10 +182,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
+ journal_key_cmp(&n, &keys->data[idx]) == 0) {
+ if (keys->data[idx].allocated)
+ kfree(keys->data[idx].k);
+ keys->data[idx] = n;
return 0;
}
@@ -196,17 +198,17 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
.size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
- if (!new_keys.d) {
+ new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+ if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -BCH_ERR_ENOMEM_journal_key_insert;
}
/* Since @keys was full, there was no gap: */
- memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
- kvfree(keys->d);
- keys->d = new_keys.d;
+ memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
+ kvfree(keys->data);
+ keys->data = new_keys.data;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
@@ -216,11 +218,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
journal_iters_move_gap(c, keys->gap, idx);
- move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
- keys->gap = idx;
+ move_gap(keys, idx);
keys->nr++;
- keys->d[keys->gap++] = n;
+ keys->data[keys->gap++] = n;
journal_iters_fix(c);
@@ -267,10 +268,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->size &&
- keys->d[idx].btree_id == btree &&
- keys->d[idx].level == level &&
- bpos_eq(keys->d[idx].k->k.p, pos))
- keys->d[idx].overwritten = true;
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos))
+ keys->data[idx].overwritten = true;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -284,16 +285,16 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->d + iter->idx;
+ struct journal_key *k = iter->keys->data + iter->idx;
- while (k < iter->keys->d + iter->keys->size &&
+ while (k < iter->keys->data + iter->keys->size &&
k->btree_id == iter->btree_id &&
k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
- k = iter->keys->d + iter->idx;
+ k = iter->keys->data + iter->idx;
}
return bkey_s_c_null;
@@ -334,9 +335,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
iter->pos = bpos_successor(iter->pos);
}
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+ struct btree_and_journal_iter iter = *_iter;
+ struct bch_fs *c = iter.trans->c;
+ unsigned level = iter.journal.level;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (level > 1 ? 0 : 2)
+ : (level > 1 ? 1 : 16);
+
+ iter.prefetch = false;
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr--) {
+ bch2_btree_and_journal_iter_advance(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k, ret;
+
+ if (iter->prefetch && iter->journal.level)
+ btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
@@ -376,17 +406,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
@@ -396,15 +427,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
* this version is used by btree_gc before filesystem has gone RW and
* multithreaded, so uses the journal_iters list:
*/
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &c->journal_iters);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@@ -415,9 +446,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
struct genradix_iter iter;
genradix_for_each(&c->journal_entries, iter, i)
- if (*i)
- kvpfree(*i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&(*i)->j));
+ kvfree(*i);
genradix_free(&c->journal_entries);
}
@@ -437,22 +466,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
+ move_gap(keys, keys->nr);
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->allocated)
kfree(i->k);
- kvfree(keys->d);
- keys->d = NULL;
+ kvfree(keys->data);
+ keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
@@ -460,83 +487,38 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- struct journal_key *src, *dst;
+ sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
- sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+ struct journal_key *dst = keys->data;
- src = dst = keys->d;
- while (src < keys->d + keys->nr) {
- while (src + 1 < keys->d + keys->nr &&
- !journal_key_cmp(src, src + 1))
- src++;
+ darray_for_each(*keys, src) {
+ if (src + 1 < &darray_top(*keys) &&
+ !journal_key_cmp(src, src + 1))
+ continue;
- *dst++ = *src++;
+ *dst++ = *src;
}
- keys->nr = dst - keys->d;
+ keys->nr = dst - keys->data;
}
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
struct journal_replay *i, **_i;
- struct jset_entry *entry;
- struct bkey_i *k;
struct journal_keys *keys = &c->journal_keys;
- size_t nr_keys = 0, nr_read = 0;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (!i || i->ignore)
- continue;
-
- for_each_jset_key(k, entry, &i->j)
- nr_keys++;
- }
-
- if (!nr_keys)
- return 0;
-
- keys->size = roundup_pow_of_two(nr_keys);
-
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- if (!keys->d) {
- bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
- nr_keys);
-
- do {
- keys->size >>= 1;
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- } while (!keys->d && keys->size > nr_keys / 8);
-
- if (!keys->d) {
- bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
- keys->size);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
+ size_t nr_read = 0;
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
cond_resched();
for_each_jset_key(k, entry, &i->j) {
- if (keys->nr == keys->size) {
- __journal_keys_sort(keys);
-
- if (keys->nr > keys->size * 7 / 8) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
- keys->nr, keys->size, nr_read, nr_keys);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- keys->d[keys->nr++] = (struct journal_key) {
+ struct journal_key n = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
@@ -544,6 +526,18 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.journal_offset = k->_data - i->j._data,
};
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+
+ BUG_ON(darray_push(keys, n));
+ }
+
nr_read++;
}
}
@@ -551,6 +545,6 @@ int bch2_journal_keys_sort(struct bch_fs *c)
__journal_keys_sort(keys);
keys->gap = keys->nr;
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
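
The rewritten bch2_journal_keys_sort() above drops the old two-pass counting
allocation in favor of pushing into a growable array and compacting in place
when it fills, bailing out only if sort-and-dedup recovers less than 1/8th
of the buffer (the keys->nr * 8 > keys->size * 7 check). A simplified sketch
of that push-then-compact pattern (illustrative only; plain ints instead of
struct journal_key, and a fixed-size buffer instead of a darray):

	#include <stdlib.h>

	struct entry { int key; };

	static int cmp_entry(const void *_l, const void *_r)
	{
		const struct entry *l = _l, *r = _r;

		return (l->key > r->key) - (l->key < r->key);
	}

	static size_t sort_and_dedup(struct entry *d, size_t nr)
	{
		size_t out = 0;

		qsort(d, nr, sizeof(*d), cmp_entry);
		for (size_t i = 0; i < nr; i++)
			if (!i || d[i].key != d[i - 1].key)
				d[out++] = d[i];
		return out;
	}

	/* Returns 0 on success, -1 if even compaction can't make room. */
	static int push(struct entry *d, size_t *nr, size_t size, struct entry e)
	{
		if (*nr == size) {
			*nr = sort_and_dedup(d, *nr);
			if (*nr * 8 > size * 7)	/* compaction barely helped */
				return -1;
		}
		d[(*nr)++] = e;
		return 0;
	}
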
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 8ca4c100b2e3..c9d19da3ea04 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -15,6 +15,7 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
+ struct btree_trans *trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@@ -22,6 +23,7 @@ struct btree_and_journal_iter {
struct journal_iter journal;
struct bpos pos;
bool at_end;
+ bool prefetch;
};
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+ struct btree_and_journal_iter *);
+
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *,
struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *,
- struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *);
void bch2_journal_keys_put(struct bch_fs *);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 74e52fd28abe..581edcb0911b 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct bkey_i *new_k = NULL;
int ret;
- k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
- BTREE_ITER_KEY_CACHE_FILL|
- BTREE_ITER_CACHED_NOFILL);
+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+ BTREE_ITER_KEY_CACHE_FILL|
+ BTREE_ITER_CACHED_NOFILL);
+ iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -674,7 +676,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
!bch2_journal_error(j), c,
- "error flushing key cache: %s", bch2_err_str(ret));
+ "flushing key cache: %s", bch2_err_str(ret));
if (ret)
goto out;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 684397442338..b9b151e693ed 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -747,7 +747,8 @@ void bch2_trans_downgrade(struct btree_trans *trans)
return;
trans_for_each_path(trans, path, i)
- bch2_btree_path_downgrade(trans, path);
+ if (path->ref)
+ bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 4a5a64499eb7..9404d96c38f3 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
+#include "bbpos_types.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
@@ -173,6 +174,11 @@ struct btree_cache {
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
+
+ struct bbpos pinned_nodes_start;
+ struct bbpos pinned_nodes_end;
+ u64 pinned_nodes_leaf_mask;
+ u64 pinned_nodes_interior_mask;
};
struct btree_node_iter {
@@ -654,6 +660,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_subvolumes)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
@@ -727,7 +734,7 @@ struct btree_root {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
- s8 error;
+ s16 error;
};
enum btree_gc_coalesce_fail_reason {
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c3ff365acce9..a4b40c1656a5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -452,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
* the key cache - but the key has to exist in the btree for that to
* work:
*/
- if (path->cached && bkey_deleted(&i->old_k))
+ if (path->cached && !i->old_btree_u64s)
return flush_new_cached_update(trans, i, flags, ip);
return 0;
@@ -788,6 +788,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = pos;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
struct bkey_i k;
bkey_init(&k.k);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index b9382b7b288b..cc7c53e83f89 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
enum btree_id btree, struct bpos pos)
{
- return bch2_btree_bit_mod(trans, btree, pos, false);
+ return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 4530b14ff2c3..b2f5f2e50f7e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -25,8 +25,7 @@
#include <linux/random.h>
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *,
- struct keylist *, unsigned);
+ btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@@ -647,7 +646,7 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_trans_unlock(trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s(): error %s", __func__, bch2_err_str(ret));
+ "%s", bch2_err_str(ret));
err:
if (as->b) {
@@ -1068,13 +1067,18 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < c->journal.watermark) {
+ if (watermark < c->journal.watermark) {
struct journal_res res = { 0 };
+ unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
+
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+ watermark != BCH_WATERMARK_reclaim)
+ journal_flags |= JOURNAL_RES_GET_NONBLOCK;
ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1,
- watermark|JOURNAL_RES_GET_CHECK));
+ bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
if (ret)
return ERR_PTR(ret);
}
@@ -1118,6 +1122,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
closure_init(&as->cl, NULL);
as->c = c;
as->start_time = start_time;
+ as->ip_started = _RET_IP_;
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
@@ -1193,7 +1198,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
err:
bch2_btree_update_free(as, trans);
if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS))
+ !bch2_err_matches(ret, EROFS) &&
+ ret != -BCH_ERR_journal_reclaim_would_deadlock)
bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1208,10 +1214,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
- BUG_ON(btree_node_root(c, b) &&
- (b->c.level < btree_node_root(c, b)->c.level ||
- !btree_node_dying(btree_node_root(c, b))));
-
bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
@@ -1477,7 +1479,7 @@ static void btree_split_insert_keys(struct btree_update *as,
static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(trans->paths + path, b);
@@ -1578,7 +1580,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err;
} else if (n3) {
@@ -1673,7 +1675,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
- * @flags: transaction commit flags
*
* Returns: 0 on success, typically transaction restart error on failure
*
@@ -1683,7 +1684,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree_path *path = trans->paths + path_idx;
@@ -1739,7 +1740,7 @@ split:
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path_idx, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@@ -1747,7 +1748,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
unsigned flags)
{
/* btree_split & merge may both cause paths array to be reallocated */
-
struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
@@ -1759,7 +1759,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
- ret = btree_split(as, trans, path, b, NULL, flags);
+ ret = btree_split(as, trans, path, b, NULL);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
@@ -1775,6 +1775,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return ret;
}
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path_idx)
+{
+ struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
+ struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
+
+ BUG_ON(!btree_node_locked(path, b->c.level));
+
+ n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ path->locks_want++;
+ BUG_ON(btree_node_locked(path, n->c.level));
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path, n);
+
+ n->sib_u64s[0] = U16_MAX;
+ n->sib_u64s[1] = U16_MAX;
+
+ bch2_keylist_add(&as->parent_keys, &b->key);
+ btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+ bch2_btree_set_root(as, trans, path, n);
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_trans_node_add(trans, path, n);
+ six_unlock_intent(&n->c.lock);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_add_tail(&b->list, &c->btree_cache.live);
+ mutex_unlock(&c->btree_cache.lock);
+
+ bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+ struct btree_update *as =
+ bch2_btree_update_start(trans, trans->paths + path,
+ b->c.level, true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ __btree_increase_depth(as, trans, path);
+ bch2_btree_update_done(as, trans);
+ return 0;
+}
+
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_path_idx_t path,
unsigned level,
@@ -1845,8 +1899,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
__func__, buf1.buf, buf2.buf);
printbuf_exit(&buf1);
printbuf_exit(&buf2);
- bch2_topology_error(c);
- ret = -EIO;
+ ret = bch2_topology_error(c);
goto err;
}
@@ -1916,7 +1969,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err_free_update;
@@ -1987,8 +2040,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path,
- parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
if (ret)
goto err;
} else {
@@ -2069,7 +2121,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(trans, a));
- bch_err_fn(c, ret);
+ bch_err_fn_ratelimited(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2116,7 +2168,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
}
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
void bch2_do_pending_node_rewrites(struct bch_fs *c)
@@ -2128,7 +2180,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
list_del(&a->list);
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
mutex_unlock(&c->pending_node_rewrites_lock);
}
@@ -2396,12 +2448,12 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%p m %u w %u r %u j %llu\n",
- as,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ (void *) as->ip_started,
+ as->mode,
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -2465,6 +2517,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
+ if (c->btree_node_rewrite_worker)
+ destroy_workqueue(c->btree_node_rewrite_worker);
if (c->btree_interior_update_worker)
destroy_workqueue(c->btree_interior_update_worker);
mempool_exit(&c->btree_interior_update_pool);
@@ -2485,10 +2539,15 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ c->btree_node_rewrite_worker =
+ alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+ if (!c->btree_node_rewrite_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
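An ordered workqueue runs at most one work item at a time, so moving rewrites off btree_interior_update_worker both serializes them and lets interior updates use the higher concurrency (8) granted above. The distinction, in isolation:

struct workqueue_struct *update_wq, *rewrite_wq;

/* up to 8 interior updates in flight concurrently: */
update_wq = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);

/* node rewrites strictly one at a time, in queue order: */
rewrite_wq = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);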
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c593c925d1e3..f651dd48aaa0 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -32,6 +32,7 @@ struct btree_update {
struct closure cl;
struct bch_fs *c;
u64 start_time;
+ unsigned long ip_started;
struct list_head list;
struct list_head unwritten_list;
@@ -119,6 +120,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index ac7844861966..5cbad8445782 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -378,7 +378,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
}
}
err:
- bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
@@ -574,8 +574,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
struct journal_keys_to_wb dst;
- struct jset_entry *entry;
- struct bkey_i *k;
int ret = 0;
bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
@@ -590,7 +588,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
entry->type = BCH_JSET_ENTRY_btree_keys;
}
+ spin_lock(&c->journal.lock);
buf->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
out:
bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 54f7826ac498..96edf2c34d43 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -990,8 +990,8 @@ static int __trigger_extent(struct btree_trans *trans,
ret = !gc
? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
- bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
- __func__);
+ bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
+ bch2_err_str(ret));
if (ret)
return ret;
}
@@ -1020,7 +1020,7 @@ static int __trigger_extent(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
printbuf_exit(&buf);
}
if (ret)
@@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
(int) bch2_bkey_needs_rebalance(c, old);
if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ new.k->p, mod > 0);
if (ret)
return ret;
}
@@ -1335,7 +1336,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+ kvfree(buckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -1345,16 +1346,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
- if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
- GFP_KERNEL|__GFP_ZERO))) {
+ if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
if ((c->opts.buckets_nouse &&
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)))) {
+ !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO)))) {
ret = -BCH_ERR_ENOMEM_buckets_nouse;
goto err;
}
@@ -1397,8 +1398,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- kvpfree(buckets_nouse,
- BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
@@ -1407,27 +1407,21 @@ err:
void bch2_dev_buckets_free(struct bch_dev *ca)
{
- unsigned i;
-
- kvpfree(ca->buckets_nouse,
- BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
- sizeof(struct bucket_gens) + ca->mi.nbuckets);
+ kvfree(ca->buckets_nouse);
+ kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
-
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -BCH_ERR_ENOMEM_usage_init;
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -BCH_ERR_ENOMEM_usage_init;
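kvpmalloc()/kvpfree() were wrappers that required the caller to pass the allocation size back at free time; plain kvmalloc()/kvfree() track that themselves, which is why every call site in this series loses its size argument. The conversion pattern:

void *buf = kvmalloc(size, GFP_KERNEL|__GFP_ZERO);	/* was kvpmalloc() */
if (!buf)
	return -ENOMEM;
/* ... use buf ... */
kvfree(buf);					/* was kvpfree(buf, size) */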
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 226b39c17667..38defa19d52d 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -22,12 +22,6 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
-__must_check
-static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -155,19 +149,35 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
kfree(thr);
}
-static int bch2_fsck_offline_thread_fn(void *arg)
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
- thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
- if (!thr->thr.thr.ret)
- bch2_fs_stop(c);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
- thread_with_stdio_done(&thr->thr);
- return 0;
+ int ret = 0;
+ if (test_bit(BCH_FS_errors_fixed, &c->flags))
+ ret |= 1;
+ if (test_bit(BCH_FS_error, &c->flags))
+ ret |= 4;
+
+ bch2_fs_stop(c);
+
+ if (ret & 1)
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+ if (ret & 4)
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+
+ return ret;
}
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_offline_thread_fn,
+};
+
static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
struct bch_ioctl_fsck_offline arg;
@@ -220,9 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_offline_thread_fn);
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
err:
if (ret < 0) {
if (thr)
@@ -763,9 +771,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
-static int bch2_fsck_online_thread_fn(void *arg)
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = thr->c;
c->stdio_filter = current;
@@ -793,13 +801,16 @@ static int bch2_fsck_online_thread_fn(void *arg)
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
- thread_with_stdio_done(&thr->thr);
-
up(&c->online_fsck_mutex);
bch2_ro_ref_put(c);
- return 0;
+ return ret;
}
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_online_thread_fn,
+};
+
static long bch2_ioctl_fsck_online(struct bch_fs *c,
struct bch_ioctl_fsck_online arg)
{
@@ -840,9 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
goto err;
}
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_online_thread_fn);
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
if (ret < 0) {
bch_err_fn(c, ret);
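Bundling the callbacks into a thread_with_stdio_ops vtable lets common code own completion (the explicit thread_with_stdio_done() calls disappear above) and turns .fn's return value into the thread's exit code. A hypothetical minimal user, with hello_exit standing in for a real destructor:

static int hello_fn(struct thread_with_stdio *stdio)
{
	bch2_stdio_redirect_printf(&stdio->stdio, false, "hello\n");
	return 0;	/* becomes the thread's exit code */
}

static const struct thread_with_stdio_ops hello_ops = {
	.exit	= hello_exit,	/* hypothetical: frees the containing struct */
	.fn	= hello_fn,
};

/* ... */
ret = bch2_run_thread_with_stdio(&thr->thr, &hello_ops);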
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3c761ad6b1c8..4701457f6381 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -558,7 +558,7 @@ got_key:
return 0;
}
-#include "../crypto.h"
+#include "crypto.h"
#endif
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 33df8cf86bd8..1410365a8891 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return 0;
if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_read_init;
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
@@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
- if (mempool_init_kvpmalloc_pool(
+ if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
if (!mempool_initialized(&c->decompress_workspace) &&
- mempool_init_kvpmalloc_pool(&c->decompress_workspace,
- 1, decompress_workspace_size))
+ mempool_init_kvmalloc_pool(&c->decompress_workspace,
+ 1, decompress_workspace_size))
return -BCH_ERR_ENOMEM_decompression_workspace_init;
return 0;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7bdba8507fc9..208ce6f0fc43 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -170,7 +170,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
printbuf_exit(&buf);
}
out:
@@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_buf_bytes(b));
+ kvfree(n_ondisk);
percpu_ref_put(&ca->io_ref);
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 4ae1e9f002a0..d37bd07afbfe 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -144,19 +144,21 @@ fsck_err:
return ret;
}
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
- prt_printf(out, "%.*s -> %llu type %s",
- d_name.len,
- d_name.name,
- d.v->d_type != DT_SUBVOL
- ? le64_to_cpu(d.v->d_inum)
- : le32_to_cpu(d.v->d_child_subvol),
- bch2_d_type_str(d.v->d_type));
+ prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+
+ if (d.v->d_type != DT_SUBVOL)
+ prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ else
+ prt_printf(out, "%u -> %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ le32_to_cpu(d.v->d_child_subvol));
+
+ prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u64 dir, u32 snapshot,
+ u32 dir_subvol, u64 dir, u32 snapshot,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
bch_str_hash_flags_t str_hash_flags)
{
- subvol_inum zero_inum = { 0 };
+ subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.inode = dir;
dirent->k.p.snapshot = snapshot;
- ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- zero_inum, snapshot,
- &dirent->k_i, str_hash_flags,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, snapshot,
+ &dirent->k_i, str_hash_flags,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+ unsigned src_update_flags = 0;
+ bool delete_src, delete_dst;
int ret = 0;
- if (src_dir.subvol != dst_dir.subvol)
- return -EXDEV;
-
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
@@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
- src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
- if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
- return -EOPNOTSUPP;
-
-
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
@@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
-
- dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
- if (dst_type == DT_SUBVOL)
- return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
@@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
+ if (new_dst->v.d_type == DT_SUBVOL)
+ new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ new_src->v.d_type == DT_SUBVOL)
+ new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
out_set_src:
-
/*
- * If we're deleting a subvolume, we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot:
+ * If we're deleting a subvolume we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot - there can only be
+ * a single dirent that points to a given subvolume.
+ *
+ * IOW, we don't maintain multiple versions in different snapshots of
+ * dirents that point to subvolumes - dirents that point to subvolumes
+ * are only visible in one particular subvolume so it's not necessary,
+ * and it would be particularly confusing for fsck to have to deal with.
*/
- if (src_type == DT_SUBVOL) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter);
+ delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+ new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+ delete_dst = old_dst.k &&
+ bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+ new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+ if (!delete_src || !bkey_deleted(&new_src->k)) {
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
+ }
- new_src->k.p = src_iter.pos;
- src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ if (delete_src) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ if (ret)
+ goto out;
}
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
+ if (delete_dst) {
+ bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ if (ret)
+ goto out;
+ }
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
@@ -456,41 +472,29 @@ out:
return ret;
}
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- return ret;
-
- ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
+ int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
if (ret)
return ret;
- k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
- d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, inum);
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
if (ret > 0)
ret = -ENOENT;
err:
if (ret)
bch2_trans_iter_exit(trans, iter);
-
return ret;
}
@@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
struct btree_iter iter = { NULL };
int ret = lockrestart_do(trans,
- __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
SPOS(dir, 0, snapshot),
POS(dir, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
- ret = -ENOTEMPTY;
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
+ continue;
+ ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+ bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 21ffeb78f02e..bee55cca2aa0 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
-int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
bch_str_hash_flags_t);
@@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
const struct bch_hash_info *,
const struct qstr *, subvol_inum *);
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d503af270024..082075244e16 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -448,7 +448,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
printbuf_exit(&buf);
return ret;
}
@@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
unsigned i;
for (i = 0; i < s->v.nr_blocks; i++) {
- kvpfree(buf->data[i], buf->size << 9);
+ kvfree(buf->data[i]);
buf->data[i] = NULL;
}
}
@@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
memset(buf->valid, 0xFF, sizeof(buf->valid));
for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
if (!buf->data[i])
goto err;
}
@@ -1868,10 +1868,10 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
return -BCH_ERR_stripe_alloc_blocked;
ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+ "reading stripe key: %s", bch2_err_str(ret));
if (ret) {
bch2_stripe_close(c, h->s);
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
return ret;
}
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index d260ff9bbfeb..43557bebd0f8 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "errcode.h"
+#include "trace.h"
#include <linux/errname.h>
@@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class)
return err == class;
}
-int __bch2_err_class(int err)
+int __bch2_err_class(int bch_err)
{
- err = -err;
- BUG_ON((unsigned) err >= BCH_ERR_MAX);
+ int std_err = -bch_err;
+ BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
- while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
- err = bch2_errcode_parents[err - BCH_ERR_START];
+ while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
+ std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
+
+ trace_error_downcast(bch_err, std_err, _RET_IP_);
- return -err;
+ return -std_err;
}
const char *bch2_blk_status_to_str(blk_status_t status)
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 8c40c2067a04..af25d8ec60f2 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,10 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
+ x(EINVAL, mount_option) \
+ x(BCH_ERR_mount_option, option_name) \
+ x(BCH_ERR_mount_option, option_value) \
+ x(BCH_ERR_mount_option, option_not_bool) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@@ -78,6 +82,7 @@
x(ENOMEM, ENOMEM_fs_name_alloc) \
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOMEM, ENOMEM_disk_accounting) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -109,6 +114,8 @@
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
+ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
@@ -176,6 +183,9 @@
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
x(EINVAL, opt_parse_error) \
+ x(EINVAL, remove_with_metadata_missing_unimplemented)\
+ x(EINVAL, remove_would_lose_data) \
+ x(EINVAL, btree_iter_with_journal_not_supported) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -225,7 +235,10 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
x(EIO, sb_not_downgraded) \
- x(EIO, btree_write_all_failed) \
+ x(EIO, btree_node_write_all_failed) \
+ x(EIO, btree_node_read_error) \
+ x(EIO, btree_node_read_validate_error) \
+ x(EIO, btree_need_topology_repair) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -238,7 +251,8 @@
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem)
+ x(BCH_ERR_nopromote, nopromote_enomem) \
+ x(0, need_inode_lock)
enum bch_errcode {
BCH_ERR_START = 2048,
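Each x(parent, name) line makes the new code a child of an existing errno or bcachefs code, so bch2_err_matches() still sees the errno class while the code stays distinguishable internally. For the ENOTEMPTY additions above, roughly:

int ret = -BCH_ERR_ENOTEMPTY_subvol_not_empty;

/* matches the errno parent: */
BUG_ON(!bch2_err_matches(ret, ENOTEMPTY));

/* but remains distinguishable from ENOTEMPTY_dir_not_empty: */
if (ret == -BCH_ERR_ENOTEMPTY_subvol_not_empty)
	pr_err("subvolume not empty\n");

/* and downcasts to the plain errno at the fs boundary: */
ret = bch2_err_class(ret);	/* now -ENOTEMPTY */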
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d32c8bebe46c..043431206799 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
+#include "recovery.h"
#include "super.h"
#include "thread_with_file.h"
@@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
-void bch2_topology_error(struct bch_fs *c)
+int bch2_topology_error(struct bch_fs *c)
{
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
+ if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
bch2_inconsistent_error(c);
+ return -BCH_ERR_btree_need_topology_repair;
+ } else {
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
+ -BCH_ERR_btree_node_read_validate_error;
+ }
}
void bch2_fatal_error(struct bch_fs *c)
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index fec17d1353d1..ae1d6674c512 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -30,7 +30,7 @@ struct work_struct;
bool bch2_inconsistent_error(struct bch_fs *);
-void bch2_topology_error(struct bch_fs *);
+int bch2_topology_error(struct bch_fs *);
#define bch2_fs_inconsistent(c, ...) \
({ \
@@ -191,9 +191,9 @@ do { \
void bch2_fatal_error(struct bch_fs *);
-#define bch2_fs_fatal_error(c, ...) \
+#define bch2_fs_fatal_error(c, _msg, ...) \
do { \
- bch_err(c, __VA_ARGS__); \
+ bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
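With __func__ folded into the macro, callers stop supplying their own "%s():" prefix - which is why several hunks in this series trim their format strings down to a bare message or a leading ':'. Before and after, at a typical call site:

/* before: caller named the function itself */
bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);

/* after: the macro prepends "<caller>(): fatal error ": */
bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);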
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 6bf839d69e84..fd2669cdd76f 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -43,6 +43,11 @@ enum bkey_invalid_flags;
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+#define extent_entry_next_safe(_entry, _end) \
+ (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
+ ? extent_entry_next(_entry) \
+ : _end)
+
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@@ -103,17 +108,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}
static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
- switch (extent_entry_type(e)) {
+ switch (__extent_entry_type(e)) {
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
@@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
(_entry) < (_end); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define __bkey_ptr_next(_ptr, _end) \
({ \
@@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
(_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (extent_entry_type(_entry)) { \
+ switch (__extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
(_ptr).ptr = _entry->ptr; \
goto out; \
@@ -344,7 +349,7 @@ out: \
for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
(_entry) = _start; \
__bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
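The point of the _safe variant: an entry with an unknown type has an unknown size, so stepping by extent_entry_next() on a not-yet-validated key could walk past the end of the value. Restated as a plain function, as understood from the macro above:

static inline const union bch_extent_entry *
entry_next_safe(const union bch_extent_entry *e,
		const union bch_extent_entry *end)
{
	/* unknown type => unknown size: clamp to end, terminating iteration */
	return __extent_entry_type(e) < BCH_EXTENT_ENTRY_MAX
		? extent_entry_next(e)
		: end;
}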
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 66b945be10c2..d8153fe27037 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -24,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
- (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+ (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
- kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 1c1ea0f0c692..624e6f963240 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans,
u32 new_subvol, dir_snapshot;
ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ dir.subvol,
snapshot_src.subvol,
&new_subvol, &snapshot,
(flags & BCH_CREATE_SNAPSHOT_RO) != 0);
@@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
const struct qstr *name,
- bool deleting_snapshot)
+ bool deleting_subvol)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_INTENT);
+ ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, inum);
if (ret)
goto err;
}
- if (deleting_snapshot && !inode_u->bi_subvol) {
+ if (deleting_subvol && !inode_u->bi_subvol) {
ret = -BCH_ERR_ENOENT_not_subvol;
goto err;
}
- if (deleting_snapshot || inode_u->bi_subvol) {
+ if (inode_u->bi_subvol) {
+ /* Recursive subvolume destroy not allowed (yet?) */
+ ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_subvol || inode_u->bi_subvol) {
ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
if (ret)
goto err;
@@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
return ret;
}
+static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvol),
+ BTREE_ITER_CACHED, subvolume);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.fs_path_parent = cpu_to_le32(new_parent);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
int bch2_rename_trans(struct btree_trans *trans,
subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
@@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
+ if (src_inode_u->bi_subvol &&
+ dst_dir.subvol != src_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ dst_inode_u->bi_subvol &&
+ src_dir.subvol != dst_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ /* Can't move across subvolumes, unless it's a subvolume root: */
+ if (src_dir.subvol != dst_dir.subvol &&
+ (!src_inode_u->bi_subvol ||
+ (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (src_inode_u->bi_parent_subvol)
+ src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ dst_inode_u->bi_parent_subvol)
+ dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
src_inode_u->bi_dir = dst_dir_u->bi_inum;
src_inode_u->bi_dir_offset = dst_offset;
@@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inum)) {
- ret = -ENOTEMPTY;
- goto err;
+ if (S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, dst_inum);
+ if (ret)
+ goto err;
}
}
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 27710cdd5710..39292e7ef342 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
- loff_t pos, unsigned len)
+ loff_t pos, unsigned len,
+ bool inode_locked)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
+ /*
+ * If we're not using the inode lock, we need to lock all the folios for
+ * atomicity of writes vs. other writes:
+ */
+ if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+ ret = -BCH_ERR_need_inode_lock;
+ goto out;
+ }
+
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
+ if (end > inode->v.i_size) {
+ BUG_ON(!inode_locked);
i_size_write(&inode->v, end);
+ }
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
+ loff_t pos;
+ bool inode_locked = false;
+ ssize_t written = 0, written2 = 0, ret = 0;
+
+ /*
+ * We don't take the inode lock unless i_size will be changing. Folio
+ * locks provide exclusion with other writes, and the pagecache add lock
+ * provides exclusion with truncate and hole punching.
+ *
+ * There is one nasty corner case where atomicity would be broken
+ * without great care: when copying data from userspace to the page
+ * cache, a page fault would recurse back into the filesystem, taking
+ * filesystem locks again, and deadlock; so the copy is done with faults
+ * disabled, and we fault in the user buffer when we aren't holding locks.
+ *
+ * If we do part of the write, but we then race and the pages backing the
+ * userspace buffer have been evicted and are no longer resident, then we
+ * have to drop our folio locks to re-fault them in, breaking write atomicity.
+ *
+ * To fix this, we restart the write from the start, if we weren't
+ * holding the inode lock.
+ *
+ * There is another wrinkle after that; if we restart the write from the
+ * start, and then get an unrecoverable error, we _cannot_ claim to
+ * userspace that we did not write data we actually did - so we must
+ * track the most we ever wrote (written2).
+ */
+
+ if ((iocb->ki_flags & IOCB_APPEND) ||
+ (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+ inode_lock(&inode->v);
+ inode_locked = true;
+ }
+
+ ret = generic_write_checks(iocb, iter);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+ if (ret) {
+ if (!inode_locked) {
+ inode_lock(&inode->v);
+ inode_locked = true;
+ ret = file_remove_privs_flags(file, 0);
+ }
+ if (ret)
+ goto unlock;
+ }
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ pos = iocb->ki_pos;
bch2_pagecache_add_get(inode);
+ if (!inode_locked &&
+ (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+ goto get_inode_lock;
+
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@@ -1004,12 +1072,17 @@ again:
}
}
+ if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+ goto get_inode_lock;
+
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+ if (ret == -BCH_ERR_need_inode_lock)
+ goto get_inode_lock;
if (unlikely(ret < 0))
break;
@@ -1030,50 +1103,46 @@ again:
}
pos += ret;
written += ret;
+ written2 = max(written, written2);
+
+ if (ret != bytes && !inode_locked)
+ goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
+ if (0) {
+get_inode_lock:
+ bch2_pagecache_add_put(inode);
+ inode_lock(&inode->v);
+ inode_locked = true;
+ bch2_pagecache_add_get(inode);
+
+ iov_iter_revert(iter, written);
+ pos -= written;
+ written = 0;
+ ret = 0;
+ }
+ } while (iov_iter_count(iter));
bch2_pagecache_add_put(inode);
+unlock:
+ if (inode_locked)
+ inode_unlock(&inode->v);
+
+ iocb->ki_pos += written;
- return written ? written : ret;
+ ret = max(written, written2) ?: ret;
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
+ ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+ ? bch2_direct_write(iocb, iter)
+ : bch2_buffered_write(iocb, iter);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
return bch2_err_class(ret);
}
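The `if (0) { get_inode_lock: ... }` construct above is the C idiom for a slow path reachable only by goto from inside the loop: the block is skipped on normal flow, and after the goto it falls through to the loop condition like any other iteration. Its shape, reduced to a self-contained sketch:

static void retry_loop_sketch(bool need_slow_path, int iters)
{
	int i = 0;

	do {
		if (need_slow_path)
			goto slow;

		/* fast path work goes here */

		if (0) {
slow:
			/* only reachable via the goto; fix up state
			 * (in the write path: take the inode lock,
			 * revert the iterator) and retry cleanly */
			need_slow_path = false;
		}
	} while (++i < iters);
}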
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 8cbaba6565b4..828c3d7c8f19 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -51,13 +51,10 @@ enum bch_folio_sector_state {
struct bch_folio_sector {
/* Uncompressed, fully allocated replicas (or on disk reservation): */
- unsigned nr_replicas:4;
-
+ u8 nr_replicas:4,
/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
- unsigned replicas_reserved:4;
-
- /* i_sectors: */
- enum bch_folio_sector_state state:8;
+ replicas_reserved:4;
+ u8 state;
};
struct bch_folio {
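Enum- and unsigned-typed bitfields are laid out in units of their underlying type, so the old struct was typically 4 bytes per sector; two u8-based members pack the same information into 2. If that reading is right, it could be asserted with:

BUILD_BUG_ON(sizeof(struct bch_folio_sector) != 2);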
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 77ae65542db9..0ccee05f6887 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,8 @@ retry:
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "inode %u:%llu not found when updating",
+ "%s: inode %u:%llu not found when updating",
+ bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@@ -176,45 +177,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
- struct bch_inode_unpacked inode_u;
- struct bch_inode_info *inode;
- struct btree_trans *trans;
- struct bch_subvolume subvol;
- int ret;
+ subvol_inum inum = inode_inum(inode);
+ struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ BUG_ON(!old);
- inode = to_bch_ei(iget5_locked(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->v.i_state & I_NEW))
- return &inode->v;
+ if (unlikely(old != inode)) {
+ discard_new_inode(&inode->v);
+ inode = old;
+ } else {
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ /*
+ * we really don't want insert_inode_locked2() to be setting
+ * I_NEW...
+ */
+ unlock_new_inode(&inode->v);
+ }
- trans = bch2_trans_get(c);
- ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+ return inode;
+}
- if (!ret)
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- bch2_trans_put(trans);
+#define memalloc_flags_do(_flags, _do) \
+({ \
+ unsigned _saved_flags = memalloc_flags_save(_flags); \
+ typeof(_do) _ret = _do; \
+ memalloc_noreclaim_restore(_saved_flags); \
+ _ret; \
+})
- if (ret) {
- iget_failed(&inode->v);
- return ERR_PTR(bch2_err_class(ret));
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_info *inode =
+ memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
+ to_bch_ei(new_inode(c->vfs_sb)));
+
+ if (unlikely(!inode)) {
+ int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+ if (ret && inode)
+ discard_new_inode(&inode->v);
+ if (ret)
+ return ERR_PTR(ret);
}
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
+ return inode;
+}
- unlock_new_inode(&inode->v);
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_info *inode =
+ to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+ if (inode)
+ return &inode->v;
- return &inode->v;
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+ int ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ if (!ret) {
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ inode = bch2_inode_insert(c, inode);
+ }
+ bch2_trans_put(trans);
+
+ return ret ? ERR_PTR(ret) : &inode->v;
}
struct bch_inode_info *
@@ -226,7 +270,7 @@ __bch2_create(struct mnt_idmap *idmap,
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode, *old;
+ struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
@@ -293,7 +337,6 @@ err_before_quota:
mutex_unlock(&dir->ei_update_lock);
}
- bch2_iget5_set(&inode->v, &inum);
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -304,36 +347,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*/
-
- inode->v.i_state |= I_CREATING;
-
- old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
-
- if (unlikely(old != inode)) {
- /*
- * We raced, another process pulled the new inode into cache
- * before us:
- */
- make_bad_inode(&inode->v);
- iput(&inode->v);
-
- inode = old;
- } else {
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- /*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
- */
- unlock_new_inode(&inode->v);
- }
-
+ inode = bch2_inode_insert(c, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -352,23 +366,78 @@ err_trans:
/* methods */
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_hash_info *dir_hash_info,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dirent_iter = {};
+ subvol_inum inum = {};
+
+ int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ struct bch_inode_info *inode =
+ to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+ if (inode)
+ goto out;
+
+ struct bch_subvolume subvol;
+ struct bch_inode_unpacked inode_u;
+ ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ if (bch2_err_matches(ret, ENOENT)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "%s points to missing inode", buf.buf);
+ printbuf_exit(&buf);
+ }
+ if (ret)
+ goto err;
+
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ inode = bch2_inode_insert(c, inode);
+out:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ return inode;
+err:
+ inode = ERR_PTR(ret);
+ goto out;
+}
+
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
- struct inode *vinode = NULL;
- subvol_inum inum = { .subvol = 1 };
- int ret;
- ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
- &dentry->d_name, &inum);
-
- if (!ret)
- vinode = bch2_vfs_inode_get(c, inum);
+ struct bch_inode_info *inode;
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+ &hash, &dentry->d_name)));
+ if (IS_ERR(inode))
+ inode = NULL;
- return d_splice_alias(vinode, dentry);
+ return d_splice_alias(&inode->v, dentry);
}
static int bch2_mknod(struct mnt_idmap *idmap,
@@ -1372,6 +1441,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
+ bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
if (BCH_SUBVOLUME_SNAP(subvol))
@@ -1572,7 +1642,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
* number:
*/
u64 avail_inodes = ((usage.capacity - usage.used) << 3);
- u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -1583,10 +1652,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = usage.nr_inodes + avail_inodes;
buf->f_ffree = avail_inodes;
- fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
- le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
@@ -1805,8 +1871,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(NULL, &opts, data);
- if (ret)
+ if (ret) {
+ ret = bch2_err_class(ret);
return ERR_PTR(ret);
+ }
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
@@ -1882,6 +1950,7 @@ got_sb:
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ sb->s_uuid = c->sb.user_uuid;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6a760777bafb..47d4eefaba7b 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -100,8 +100,8 @@ err:
}
static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -142,34 +142,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
return 0;
}
-static int __write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __write_inode(trans, inode, snapshot));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
@@ -280,7 +252,7 @@ create_lostfound:
goto err;
ret = bch2_dirent_create_snapshot(trans,
- root_inode.bi_inum, snapshot, &root_hash_info,
+ 0, root_inode.bi_inum, snapshot, &root_hash_info,
mode_to_type(lostfound->bi_mode),
&lostfound_str,
lostfound->bi_inum,
@@ -303,30 +275,47 @@ static int reattach_inode(struct btree_trans *trans,
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
+ u32 dirent_snapshot = inode_snapshot;
int ret;
- ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
+ if (inode->bi_subvol) {
+ inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ u64 root_inum;
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &dirent_snapshot, &root_inum);
+ if (ret)
+ return ret;
+
+ snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
+ } else {
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ }
+
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound);
if (ret)
return ret;
if (S_ISDIR(inode->bi_mode)) {
lostfound.bi_nlink++;
- ret = __write_inode(trans, &lostfound, U32_MAX);
+ ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
if (ret)
return ret;
}
dir_hash = bch2_hash_info_init(trans->c, &lostfound);
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
ret = bch2_dirent_create_snapshot(trans,
- lostfound.bi_inum, inode_snapshot,
+ inode->bi_parent_subvol, lostfound.bi_inum,
+ dirent_snapshot,
&dir_hash,
inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
+ &name,
+ inode->bi_subvol ?: inode->bi_inum,
+ &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
return ret;
@@ -334,7 +323,7 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
- return __write_inode(trans, inode, inode_snapshot);
+ return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
}
static int remove_backpointer(struct btree_trans *trans,
@@ -353,6 +342,27 @@ static int remove_backpointer(struct btree_trans *trans,
return ret;
}
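+/*
+ * For an unreachable subvolume: drop the stale dirent backpointer on its
+ * root inode, then reattach that inode - and thus the subvolume - under
+ * lost+found:
+ */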
+static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ ret = remove_backpointer(trans, &inode);
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
+ return ret;
+
+ ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ return ret;
+}
+
struct snapshots_seen_entry {
u32 id;
u32 equiv;
@@ -592,13 +602,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
}
static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
- u32 snapshot, bool is_whiteout)
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
- struct inode_walker_entry *i;
-
- snapshot = bch2_snapshot_equiv(c, snapshot);
+ bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
+ u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
goto found;
@@ -609,20 +618,24 @@ found:
if (snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- size_t pos;
- int ret;
new.snapshot = snapshot;
new.count = 0;
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
- w->last_pos.inode, snapshot, i->snapshot);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, snapshot, i->snapshot, buf.buf);
+ printbuf_exit(&buf);
while (i > w->inodes.data && i[-1].snapshot > snapshot)
--i;
- pos = i - w->inodes.data;
- ret = darray_insert_item(&w->inodes, pos, new);
+ size_t pos = i - w->inodes.data;
+ int ret = darray_insert_item(&w->inodes, pos, new);
if (ret)
return ERR_PTR(ret);
@@ -633,21 +646,21 @@ found:
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w, struct bpos pos,
- bool is_whiteout)
+ struct inode_walker *w,
+ struct bkey_s_c k)
{
- if (w->last_pos.inode != pos.inode) {
- int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, pos)) {
+ } else if (bkey_cmp(w->last_pos, k.k->p)) {
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
}
- w->last_pos = pos;
+ w->last_pos = k.k->p;
- return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+ return lookup_inode_for_snapshot(trans->c, w, k);
}
static int __get_visible_inodes(struct btree_trans *trans,
@@ -722,7 +735,7 @@ static int hash_redo_key(struct btree_trans *trans,
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_snapshot(trans, desc, hash_info,
+ bch2_hash_set_in_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
@@ -795,16 +808,93 @@ fsck_err:
goto out;
}
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
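+/*
+ * Get the dirent an inode's backpointer (bi_dir/bi_dir_offset) points to;
+ * a subvolume root's dirent lives in the parent subvolume, so look up the
+ * parent's snapshot first:
+ */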
+static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ if (inode->bi_subvol) {
+ u64 inum;
+ int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
+ if (ret)
+ return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
+ }
+
+ return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k);
- if (ret)
+ int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
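+/*
+ * Verify that an inode's backpointer points to a dirent that points back
+ * to this inode; if not, clear the backpointer fields so a later pass can
+ * repair or reattach the inode:
+ */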
+static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot, bool *write_inode)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ struct btree_iter dirent_iter = {};
+ struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
+ int ret = bkey_err(d);
+ if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- bch2_trans_iter_exit(trans, &iter);
- return k.k->type == KEY_TYPE_set;
+ if (fsck_err_on(ret,
+ c, inode_points_to_missing_dirent,
+ "inode points to missing dirent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
+ fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+ c, inode_points_to_wrong_dirent,
+ "inode points to dirent that does not point back:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, inode_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ /*
+ * We just clear the backpointer fields for now. If we find a
+ * dirent that points to this inode in check_dirents(), we'll
+ * update it then; then when we get to check_path() if the
+ * backpointer is still 0 we'll reattach it.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
+ *write_inode = true;
+ }
+
+ ret = 0;
+fsck_err:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
}
static int check_inode(struct btree_trans *trans,
@@ -861,7 +951,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@@ -876,7 +967,7 @@ static int check_inode(struct btree_trans *trans,
if (ret < 0)
return ret;
- fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+ fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
ret = 0;
@@ -950,8 +1041,49 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ if (u.bi_dir || u.bi_dir_offset) {
+ ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(u.bi_parent_subvol &&
+ (u.bi_subvol == 0 ||
+ u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
+ c, inode_bi_parent_nonzero,
+ "inode %llu:%u has subvol %u but nonzero parent subvol %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+
+ if (u.bi_subvol) {
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret,
+ c, inode_bi_subvol_missing,
+ "inode %llu:%u bi_subvol points to missing subvolume %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+ fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+ !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+ k.k->p.snapshot),
+ c, inode_bi_subvol_wrong,
+ "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+ le64_to_cpu(s.inode),
+ le32_to_cpu(s.snapshot))) {
+ u.bi_subvol = 0;
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+ }
+
if (do_update) {
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@@ -982,32 +1114,9 @@ int bch2_check_inodes(struct bch_fs *c)
return ret;
}
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1032,14 +1141,21 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
+}
+
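+/*
+ * The _notnested version above is for callers already running inside an
+ * outer restart loop; this wrapper additionally returns an error if the
+ * transaction was restarted:
+ */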
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_i_sectors_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
}
struct extent_end {
@@ -1312,7 +1428,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
goto err;
@@ -1423,7 +1539,7 @@ int bch2_check_extents(struct bch_fs *c)
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
- check_i_sectors(trans, &w));
+ check_i_sectors_notnested(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
@@ -1453,10 +1569,9 @@ int bch2_check_indirect_extents(struct bch_fs *c)
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1481,96 +1596,123 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
}
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- u32 target_snapshot)
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_subdir_count_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
+}
+
+static int check_dirent_inode_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
{
struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
int ret = 0;
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
if (!target->bi_dir &&
!target->bi_dir_offset) {
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
+ return __bch2_fsck_write_inode(trans, target, target_snapshot);
}
- if (!inode_points_to_dirent(target, d)) {
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
+ struct btree_iter bp_iter = { NULL };
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
- bool backpointer_exists = !ret;
- ret = 0;
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ goto out;
+ }
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (fsck_err_on(backpointer_exists &&
+ (S_ISDIR(target->bi_mode) ||
+ target->bi_subvol),
+ c, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target_snapshot, buf.buf)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
- if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
- c, inode_dir_multiple_links,
- "directory %llu:%u with multiple links\n%s",
- target->bi_inum, target_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up; it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- c, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(!backpointer_exists,
- c, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
- }
+ ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
+ if (ret)
+ goto err;
if (fsck_err_on(d.v->d_type != inode_d_type(target),
c, dirent_d_type_wrong,
@@ -1586,6 +1728,12 @@ static int check_dirent_target(struct btree_trans *trans,
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
@@ -1593,33 +1741,134 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
- c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol)) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
+/* find a subvolume that's a descendant of @snapshot: */
+static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
+ bch2_trans_iter_exit(trans, &iter);
+ *subvolid = k.k->p.offset;
+ goto found;
+ }
+ }
+ if (!ret)
+ ret = -ENOENT;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
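+/*
+ * Check a dirent that points to a subvolume: the parent subvolume must
+ * exist and be visible from the dirent's snapshot, the target subvolume
+ * must exist, and its fs_path_parent and root inode backpointer must be
+ * consistent with this dirent:
+ */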
+static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c_dirent d)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter subvol_iter = {};
+ struct bch_inode_unpacked subvol_root;
+ u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 parent_snapshot;
+ u64 parent_inum;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
+ "dirent parent_subvol points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
+ fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
+ c, dirent_not_visible_in_parent_subvol,
+ "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
+ parent_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ u32 new_parent_subvol;
+ ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
if (ret)
goto err;
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
+ ret = PTR_ERR_OR_ZERO(new_dirent);
+ if (ret)
+ goto err;
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
+ }
+
+ struct bkey_s_c_subvolume s =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, target_subvol),
+ 0, subvolume);
+ ret = bkey_err(s.s_c);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret) {
+ if (fsck_err(c, dirent_to_missing_subvol,
+ "dirent points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
+ return __remove_dirent(trans, d.k->p);
+ ret = 0;
+ goto out;
+ }
+
+ if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
+ c, subvol_fs_path_parent_wrong,
+ "subvol with wrong fs_path_parent, should be be %u\n%s",
+ parent_subvol,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
- d = dirent_i_to_s_c(n);
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ }
+
+ u64 target_inum = le64_to_cpu(s.v->inode);
+ u32 target_snapshot = le32_to_cpu(s.v->snapshot);
+
+ ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
+ c, inode_bi_parent_wrong,
+ "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_parent_subvol, parent_subvol)) {
+ subvol_root.bi_parent_subvol = parent_subvol;
+ ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ return ret;
}
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ return ret;
out:
err:
fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -1661,7 +1910,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
- i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, dir, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret < 0)
goto err;
@@ -1707,50 +1956,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
- struct bch_inode_unpacked subvol_root;
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 target_snapshot;
- u64 target_inum;
-
- ret = subvol_lookup(trans, target_subvol,
- &target_snapshot, &target_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, dirent_to_missing_subvol,
- "dirent points to missing subvolume %u",
- le32_to_cpu(d.v->d_child_subvol))) {
- ret = __remove_dirent(trans, d.k->p);
- goto err;
- }
-
- ret = lookup_inode(trans, target_inum,
- &subvol_root, &target_snapshot);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, subvol_to_missing_root,
- "subvolume %u points to missing subvolume root %llu",
- target_subvol,
- target_inum)) {
- bch_err(c, "repair not implemented yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
- c, subvol_root_wrong_bi_subvol,
- "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
- target_inum,
- subvol_root.bi_subvol, target_subvol)) {
- subvol_root.bi_subvol = target_subvol;
- ret = __write_inode(trans, &subvol_root, target_snapshot);
- if (ret)
- goto err;
- }
-
- ret = check_dirent_target(trans, iter, d, &subvol_root,
- target_snapshot);
+ ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
goto err;
} else {
@@ -1776,12 +1982,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
- }
-
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, equiv.snapshot, i)
- i->count++;
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ i->count++;
+ }
out:
err:
fsck_err:
@@ -1810,7 +2015,8 @@ int bch2_check_dirents(struct bch_fs *c)
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ check_subdir_count_notnested(trans, &dir));
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
@@ -1829,10 +2035,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
int ret;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
+ if (ret < 0)
return ret;
+ if (ret)
+ return 0;
- i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -1919,7 +2127,7 @@ static int check_root_trans(struct btree_trans *trans)
0, NULL);
root_inode.bi_inum = inum;
- ret = __write_inode(trans, &root_inode, snapshot);
+ ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
bch_err_msg(c, ret, "writing root inode");
}
err:
@@ -1936,6 +2144,107 @@ int bch2_check_root(struct bch_fs *c)
return ret;
}
+typedef DARRAY(u32) darray_u32;
+
+static bool darray_u32_has(darray_u32 *d, u32 v)
+{
+ darray_for_each(*d, i)
+ if (*i == v)
+ return true;
+ return false;
+}
+
+/*
+ * We've checked that inode backpointers point to valid dirents; here, it's
+ * sufficient to check that the subvolume root has a dirent:
+ */
+static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ return inode.bi_dir != 0;
+}
+
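+/*
+ * Walk fs_path_parent pointers from a subvolume up towards the root
+ * subvolume, reattaching unreachable subvolumes and breaking loops:
+ */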
+static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter parent_iter = {};
+ darray_u32 subvol_path = {};
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
+ ret = darray_push(&subvol_path, k.k->p.offset);
+ if (ret)
+ goto err;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ ret = subvol_has_dirent(trans, s);
+ if (ret < 0)
+ break;
+
+ if (fsck_err_on(!ret,
+ c, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ u32 parent = le32_to_cpu(s.v->fs_path_parent);
+
+ if (darray_u32_has(&subvol_path, parent)) {
+ if (fsck_err(c, subvol_loop, "subvolume loop"))
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &parent_iter);
+ bch2_trans_iter_init(trans, &parent_iter,
+ BTREE_ID_subvolumes, POS(0, parent), 0);
+ k = bch2_btree_iter_peek_slot(&parent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
+ c, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+ }
+fsck_err:
+err:
+ printbuf_exit(&buf);
+ darray_exit(&subvol_path);
+ bch2_trans_iter_exit(trans, &parent_iter);
+ return ret;
+}
+
+int bch2_check_subvolume_structure(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_path(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
struct pathbuf_entry {
u64 inum;
u32 snapshot;
@@ -1952,89 +2261,71 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int path_down(struct bch_fs *c, pathbuf *p,
- u64 inum, u32 snapshot)
-{
- int ret = darray_push(p, ((struct pathbuf_entry) {
- .inum = inum,
- .snapshot = snapshot,
- }));
-
- if (ret)
- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
- p->size);
- return ret;
-}
-
/*
- * Check that a given inode is reachable from the root:
+ * Check that a given inode is reachable from its subvolume root - we already
+ * verified subvolume connectivity:
*
* XXX: we should also be verifying that inodes are in the right subvolumes
*/
-static int check_path(struct btree_trans *trans,
- pathbuf *p,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
+ u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
int ret = 0;
- snapshot = bch2_snapshot_equiv(c, snapshot);
p->nr = 0;
- while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ BUG_ON(bch2_inode_unpack(inode_k, &inode));
+
+ while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
u32 parent_snapshot = snapshot;
- if (inode->bi_subvol) {
- u64 inum;
-
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &parent_snapshot, &inum);
- if (ret)
- break;
- }
-
- d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot));
+ d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
break;
- if (!ret && !dirent_points_to_inode(d, inode)) {
+ if (!ret && !dirent_points_to_inode(d, &inode)) {
bch2_trans_iter_exit(trans, &dirent_iter);
ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
}
if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(c, inode_unreachable,
- "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
- inode->bi_inum, snapshot,
- bch2_d_type_str(inode_d_type(inode)),
- inode->bi_nlink,
- inode->bi_dir,
- inode->bi_dir_offset))
- ret = reattach_inode(trans, inode, snapshot);
- break;
+ ret = 0;
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, inode_k),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode, snapshot);
+ goto out;
}
bch2_trans_iter_exit(trans, &dirent_iter);
- if (!S_ISDIR(inode->bi_mode))
+ if (!S_ISDIR(inode.bi_mode))
break;
- ret = path_down(c, p, inode->bi_inum, snapshot);
- if (ret) {
- bch_err(c, "memory allocation failure");
+ ret = darray_push(p, ((struct pathbuf_entry) {
+ .inum = inode.bi_inum,
+ .snapshot = snapshot,
+ }));
+ if (ret)
return ret;
- }
snapshot = parent_snapshot;
- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inode.bi_dir, snapshot), 0);
+ ret = bkey_err(inode_k) ?:
+ !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(inode_k, &inode);
if (ret) {
/* Should have been caught in dirents pass */
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2042,30 +2333,32 @@ static int check_path(struct btree_trans *trans,
break;
}
- if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ snapshot = inode_k.k->p.snapshot;
+
+ if (path_is_dup(p, inode.bi_inum, snapshot)) {
/* XXX print path */
bch_err(c, "directory structure loop");
darray_for_each(*p, i)
pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode->bi_inum, snapshot);
-
- if (!fsck_err(c, dir_loop, "directory structure loop"))
- return 0;
+ pr_err("%llu:%u", inode.bi_inum, snapshot);
- ret = remove_backpointer(trans, inode);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ if (fsck_err(c, dir_loop, "directory structure loop")) {
+ ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
- if (ret)
- break;
+ if (ret)
+ break;
- ret = reattach_inode(trans, inode, snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+ ret = reattach_inode(trans, &inode, snapshot);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ }
break;
}
}
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
@@ -2077,7 +2370,6 @@ fsck_err:
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
@@ -2090,12 +2382,10 @@ int bch2_check_directory_structure(struct bch_fs *c)
if (!bkey_is_inode(k.k))
continue;
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (u.bi_flags & BCH_INODE_unlinked)
+ if (bch2_inode_flags(k) & BCH_INODE_unlinked)
continue;
- check_path(trans, &path, &u, iter.pos.snapshot);
+ check_path(trans, &path, k);
})));
darray_exit(&path);
@@ -2291,7 +2581,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __write_inode(trans, &u, k.k->p.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
}
fsck_err:
return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index da991e8cf27e..a4ef94271784 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
+int bch2_check_subvolume_structure(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 086f0090b03a..2b5e06770ab3 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
return bch2_inode_unpack_slowpath(k, unpacked);
}
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+int bch2_inode_peek_nowarn(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
subvol_inum inum, unsigned flags)
@@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
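+/*
+ * Write an inode at a specific snapshot; fsck passes
+ * BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE so it can update inode keys at
+ * interior snapshot nodes:
+ */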
+int __bch2_fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fsck_write_inode(trans, inode, snapshot));
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
struct bch_inode_unpacked u;
@@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
bool old_deleted = bkey_is_deleted_inode(old);
bool new_deleted = bkey_is_deleted_inode(new.s_c);
if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, new_deleted);
if (ret)
return ret;
}
@@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
- if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
+ if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ c, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1141,7 +1171,7 @@ fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
- ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
@@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bool need_another_pass;
int ret;
again:
+ /*
+ * if we ran check_inodes() unlinked inodes will have already been
+ * cleaned up but the write buffer will be out of sync; therefore we
+ * always need a write buffer flush
+ */
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
need_another_pass = false;
/*
@@ -1183,12 +1222,8 @@ again:
ret;
}));
- if (!ret && need_another_pass) {
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
+ if (!ret && need_another_pass)
goto again;
- }
err:
bch2_trans_put(trans);
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b63f312581cf..056298050550 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
@@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
@@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}
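+/* read bi_flags directly from a packed inode key, without a full unpack: */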
+static inline u32 bch2_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873a1..8a556e6d1ab6 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 2c098ac017b3..f137252bccc5 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
bch2_congested_acct(ca, io_latency, now, rw);
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+ __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@@ -1067,7 +1068,8 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@@ -1449,7 +1452,9 @@ err:
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ "%s(): %s error: %s", __func__,
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
+ bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write)
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "misaligned write");
+ "%s write error: misaligned write",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
op->error = -EIO;
goto err;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bc890776eb57..9c9a25dbd613 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,33 +27,71 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return __journal_entry_is_open(j->reservations);
+}
+
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
union journal_res_state s = READ_ONCE(j->reservations);
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
- prt_printf(out, "seq:");
+ prt_str(out, "seq:");
prt_tab(out);
prt_printf(out, "%llu", seq);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "refcount:");
+ prt_str(out, "refcount:");
prt_tab(out);
prt_printf(out, "%u", journal_state_count(s, i));
prt_newline(out);
- prt_printf(out, "size:");
+ prt_str(out, "size:");
prt_tab(out);
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
- prt_printf(out, "expires");
+ prt_str(out, "expires:");
prt_tab(out);
prt_printf(out, "%li jiffies", buf->expires - jiffies);
prt_newline(out);
+ prt_str(out, "flags:");
+ prt_tab(out);
+ if (buf->noflush)
+ prt_str(out, "noflush ");
+ if (buf->must_flush)
+ prt_str(out, "must_flush ");
+ if (buf->separate_flush)
+ prt_str(out, "separate_flush ");
+ if (buf->need_flush_to_write_buffer)
+ prt_str(out, "need_flush_to_write_buffer ");
+ if (buf->write_started)
+ prt_str(out, "write_started ");
+ if (buf->write_allocated)
+ prt_str(out, "write allocated ");
+ if (buf->write_done)
+ prt_str(out, "write done");
+ prt_newline(out);
+
printbuf_indent_sub(out, 2);
}
@@ -66,26 +104,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq <= journal_cur_seq(j);
seq++)
bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
+ prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}
static inline struct journal_buf *
@@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
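+/*
+ * Start the journal write for the oldest unwritten entry whose
+ * reservations have all been released; journal writes are always issued
+ * in sequence order:
+ */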
+void bch2_journal_do_writes(struct journal *j)
+{
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *w = j->buf + idx;
+
+ if (w->write_started && !w->write_allocated)
+ break;
+ if (w->write_started)
+ continue;
+
+ if (!journal_state_count(j->reservations, idx)) {
+ w->write_started = true;
+ closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+ }
+
+ break;
+ }
+}
+
/*
* Final processing when the last reference of a journal buffer has been
* dropped. Drop the pin list reference acquired at journal entry open and write
* the buffer, if requested.
*/
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
- if (write)
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ bch2_journal_do_writes(j);
}
/*
@@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j)
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
+ buf->write_started = false;
+ buf->write_allocated = false;
+ buf->write_done = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- mod_delayed_work(c->io_complete_wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
+ if (nr_unwritten_journal_entries(j) == 1)
+ mod_delayed_work(j->wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
if (j->early_journal_entries.nr)
@@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- long delta;
spin_lock(&j->lock);
- if (!__journal_entry_is_open(j->reservations))
- goto unlock;
+ if (__journal_entry_is_open(j->reservations)) {
+ long delta = journal_cur_buf(j)->expires - jiffies;
- delta = journal_cur_buf(j)->expires - jiffies;
-
- if (delta > 0)
- mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+ if (delta > 0)
+ mod_delayed_work(j->wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ }
spin_unlock(&j->lock);
}
@@ -476,30 +514,29 @@ retry:
if (bch2_journal_error(j))
return -BCH_ERR_erofs_journal_err;
- spin_lock(&j->lock);
+ if (j->blocked)
+ return -BCH_ERR_journal_res_get_blocked;
- /* check once more in case somebody else shut things down... */
- if (bch2_journal_error(j)) {
- spin_unlock(&j->lock);
- return -BCH_ERR_erofs_journal_err;
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ ret = JOURNAL_ERR_journal_full;
+ can_discard = j->can_discard;
+ goto out;
+ }
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+ ret = JOURNAL_ERR_max_in_flight;
+ goto out;
}
+ spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
if (journal_res_get_fast(j, res, flags)) {
- spin_unlock(&j->lock);
- return 0;
- }
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- /*
- * Don't want to close current journal entry, just need to
- * invoke reclaim:
- */
- ret = JOURNAL_ERR_journal_full;
+ ret = 0;
goto unlock;
}
@@ -515,30 +552,30 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j);
-
- if (ret == JOURNAL_ERR_max_in_flight) {
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, true);
- if (trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- bch2_journal_bufs_to_text(&buf, j);
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- }
- count_event(c, journal_entry_full);
- }
+ ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
-
- if (!ret)
+out:
+ if (ret == JOURNAL_ERR_retry)
goto retry;
+ if (!ret)
+ return 0;
+
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
+ if (ret == JOURNAL_ERR_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
+
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@@ -674,7 +711,7 @@ recheck_need_open:
return ret;
seq = res.seq;
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf = journal_seq_to_buf(j, seq);
buf->must_flush = true;
if (!buf->flush_time) {
@@ -692,8 +729,8 @@ recheck_need_open:
}
/*
- * if write was kicked off without a flush, flush the next sequence
- * number instead
+ * if write was kicked off without a flush, or if we promised it
+ * wouldn't be a flush, flush the next sequence number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
@@ -771,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
- /* journal write is already in flight, and was a flush write: */
- if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ /* journal flush already in flight, or flush requested */
+ if (buf->must_flush)
goto out;
buf->noflush = true;
@@ -1157,13 +1194,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
- unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
last_seq = le64_to_cpu(i->j.last_seq);
@@ -1196,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
@@ -1211,8 +1247,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr)
+ bch2_dev_list_add_dev(&p->devs, ptr->dev);
had_entries = true;
}
@@ -1240,13 +1276,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
void bch2_dev_journal_exit(struct bch_dev *ca)
{
- kfree(ca->journal.bio);
- kfree(ca->journal.buckets);
- kfree(ca->journal.bucket_seq);
+ struct journal_device *ja = &ca->journal;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ kfree(ja->bio[i]);
+ ja->bio[i] = NULL;
+ }
- ca->journal.bio = NULL;
- ca->journal.buckets = NULL;
- ca->journal.bucket_seq = NULL;
+ kfree(ja->buckets);
+ kfree(ja->bucket_seq);
+ ja->buckets = NULL;
+ ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1296,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
bch2_sb_field_get(sb, journal_v2);
- unsigned i, nr_bvecs;
ja->nr = 0;
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- for (i = 0; i < nr; i++)
+ for (unsigned i = 0; i < nr; i++)
ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
} else if (journal_buckets) {
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1312,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -BCH_ERR_ENOMEM_dev_journal_init;
- nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+ unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
- ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!ca->journal.bio)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ nr_bvecs), GFP_KERNEL);
+ if (!ja->bio[i])
+ return -BCH_ERR_ENOMEM_dev_journal_init;
- bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+ ja->bio[i]->ca = ca;
+ ja->bio[i]->buf_idx = i;
+ bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+ }
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
@@ -1287,14 +1331,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned j, dst = 0;
+ unsigned dst = 0;
- for (i = 0; i < nr; i++)
- for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ for (unsigned i = 0; i < nr; i++)
+ for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
ja->buckets[dst++] =
le64_to_cpu(journal_buckets_v2->d[i].start) + j;
} else if (journal_buckets) {
- for (i = 0; i < ja->nr; i++)
+ for (unsigned i = 0; i < ja->nr; i++)
ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
}
@@ -1303,19 +1347,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- unsigned i;
+ if (j->wq)
+ destroy_workqueue(j->wq);
darray_exit(&j->early_journal_entries);
- for (i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvpfree(j->buf[i].data, j->buf[i].buf_size);
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
- unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
@@ -1336,14 +1380,20 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
- for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
+ j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
+
+ j->wq = alloc_workqueue("bcachefs_journal",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+ if (!j->wq)
+ return -BCH_ERR_ENOMEM_fs_other_alloc;
return 0;
}
@@ -1381,6 +1431,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "blocked:\t\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
@@ -1455,7 +1506,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- unsigned i;
spin_lock(&j->lock);
*seq = max(*seq, j->pin.front);
@@ -1473,7 +1523,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_newline(out);
printbuf_indent_add(out, 2);
- for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list) {
prt_printf(out, "\t%px %ps", pin, pin->flush);
prt_newline(out);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4544ce24bb8a..7c7528f839c5 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
}
bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
{
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
}
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx)) {
spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
}
}
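
The put path above drops a per-buffer reference packed into the journal's
atomic reservations word; bch2_journal_buf_put_final() runs once that
count reaches zero. A standalone toy model of the pattern (hypothetical
layout, not the kernel's union journal_res_state):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Four 8-bit per-buffer refcounts packed into one atomic u64: */
static _Atomic uint64_t state;

static unsigned buf_count(uint64_t v, unsigned idx)
{
	return (v >> (8 * idx)) & 0xff;
}

static void buf_get(unsigned idx)
{
	atomic_fetch_add(&state, (uint64_t) 1 << (8 * idx));
}

static void buf_put(unsigned idx)
{
	/* fetch_sub returns the old value: old count 1 means now zero */
	uint64_t v = atomic_fetch_sub(&state, (uint64_t) 1 << (8 * idx));

	if (buf_count(v, idx) == 1)
		printf("buf %u: last reference dropped, finalize\n", idx);
}

int main(void)
{
	buf_get(1);
	buf_get(1);
	buf_put(1);
	buf_put(1);	/* second put triggers the finalize message */
	return 0;
}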
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 47805193f18c..725fcf46f631 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,37 @@
#include "sb-clean.h"
#include "trace.h"
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ darray_for_each(j->ptrs, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+ u64 offset;
+
+ div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
+
+ if (i != j->ptrs.data)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ i->dev, i->bucket, i->bucket_offset, i->sector);
+ }
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+ bch2_journal_ptrs_to_text(out, c, j);
+
+ for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+ break;
+ }
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c,
BUG_ON(*p != i);
*p = NULL;
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ kvfree(i);
}
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
- i->ignore = true;
+ if (blacklisted)
+ i->ignore_blacklisted = true;
+ else
+ i->ignore_not_dirty = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(c, i);
@@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
{
struct genradix_iter iter;
struct journal_replay **_i, *i, *dup;
- struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
/* Is this entry older than the range we need? */
@@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (le64_to_cpu(i->j.seq) >= last_seq)
break;
- journal_replay_free(c, i);
+
+ journal_replay_free(c, i, false);
}
}
@@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
- if (bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes)) {
- i = dup;
- goto found;
- }
+ bool identical = bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes);
+ bool not_identical = !identical &&
+ entry_ptr.csum_good &&
+ dup->csum_good;
+
+ bool same_device = false;
+ darray_for_each(dup->ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ same_device = true;
+
+ ret = darray_push(&dup->ptrs, entry_ptr);
+ if (ret)
+ goto out;
- if (!entry_ptr.csum_good) {
- i = dup;
- goto found;
- }
+ bch2_journal_replay_to_text(&buf, c, dup);
- if (!dup->csum_good)
+ fsck_err_on(same_device,
+ c, journal_entry_dup_same_device,
+ "duplicate journal entry on same device\n %s",
+ buf.buf);
+
+ fsck_err_on(not_identical,
+ c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries\n %s",
+ buf.buf);
+
+ if (entry_ptr.csum_good && !identical)
goto replace;
- fsck_err(c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
- i = dup;
- goto found;
+ goto out;
}
replace:
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
return -BCH_ERR_ENOMEM_journal_entry_add;
- i->nr_ptrs = 0;
- i->csum_good = entry_ptr.csum_good;
- i->ignore = false;
+ darray_init(&i->ptrs);
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore_blacklisted = false;
+ i->ignore_not_dirty = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
- i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
- if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
- }
-
/* The first ptr should represent the jset we kept: */
- memcpy(i->ptrs + i->nr_ptrs,
- dup->ptrs,
- sizeof(dup->ptrs[0]) * dup->nr_ptrs);
- i->nr_ptrs += dup->nr_ptrs;
+ darray_for_each(dup->ptrs, ptr)
+ darray_push(&i->ptrs, *ptr);
__journal_replay_free(c, dup);
+ } else {
+ darray_push(&i->ptrs, entry_ptr);
}
*_i = i;
- return 0;
-found:
- for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
- if (ptr->dev == ca->dev_idx) {
- bch_err(c, "duplicate journal entry %llu on same device",
- le64_to_cpu(i->j.seq));
- goto out;
- }
- }
-
- if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- goto out;
- }
-
- i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
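
The rewritten duplicate handling always records the extra pointer in
dup->ptrs, then decides whether to keep the existing jset or switch to
the newly read copy. A condensed model of that decision (hypothetical
helper, not the kernel code):

#include <assert.h>
#include <stdbool.h>

static bool should_replace(bool identical, bool new_csum_good)
{
	return new_csum_good && !identical;
}

int main(void)
{
	assert(!should_replace(true, true));	/* identical: nothing to choose */
	assert(!should_replace(false, false));	/* new copy has bad csum: keep old */
	assert(should_replace(false, true));	/* good, differing copy: take it */
	return 0;
}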
@@ -374,7 +398,6 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
- struct bkey_i *k;
bool first = true;
jset_entry_for_each_key(entry, k) {
@@ -741,6 +764,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_datetime_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ unsigned bytes = vstruct_bytes(entry);
+ unsigned expected = 16;
+ int ret = 0;
+
+ if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -913,11 +967,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
+ n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
- kvpfree(b->data, b->size);
+ kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@@ -1028,9 +1082,7 @@ reread:
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %s",
- bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1102,16 +1154,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!r)
continue;
- for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ darray_for_each(r->ptrs, i)
+ if (i->dev == ca->dev_idx) {
+ unsigned wrote = bucket_remainder(ca, i->sector) +
vstruct_sectors(&r->j, c->block_bits);
- ja->cur_idx = r->ptrs[i].bucket;
+ ja->cur_idx = i->bucket;
ja->sectors_free = ca->mi.bucket_size - wrote;
goto found;
}
- }
}
found:
mutex_unlock(&jlist->lock);
@@ -1144,7 +1195,7 @@ found:
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvpfree(buf.data, buf.size);
+ kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@@ -1155,27 +1206,6 @@ err:
goto out;
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- unsigned i;
-
- for (i = 0; i < j->nr_ptrs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
- u64 offset;
-
- div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
- if (i)
- prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- j->ptrs[i].dev,
- j->ptrs[i].bucket,
- j->ptrs[i].bucket_offset,
- j->ptrs[i].sector);
- }
-}
-
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
@@ -1228,20 +1258,20 @@ int bch2_journal_read(struct bch_fs *c,
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
if (!last_write_torn && !i->csum_good) {
last_write_torn = true;
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
@@ -1280,12 +1310,12 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < *last_seq) {
- journal_replay_free(c, i);
+ journal_replay_free(c, i, false);
continue;
}
@@ -1293,7 +1323,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
- i->ignore = true;
+ i->ignore_blacklisted = true;
}
}
@@ -1302,7 +1332,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1353,32 +1383,31 @@ int bch2_journal_read(struct bch_fs *c,
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
};
- unsigned ptr;
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!i->ptrs[ptr].csum_good)
- bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ if (!ptr->csum_good)
+ bch_err_dev_offset(ca, ptr->sector,
"invalid journal checksum, seq %llu%s",
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
- bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
&i->j,
- i->ptrs[0].sector,
+ i->ptrs.data[0].sector,
READ);
if (ret)
goto err;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+ darray_for_each(i->ptrs, ptr)
+ replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
bch2_replicas_entry_sort(&replicas.e);
@@ -1547,7 +1576,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
- new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+ new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1558,7 +1587,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
swap(buf->buf_size, new_size);
spin_unlock(&j->lock);
- kvpfree(new_buf, new_size);
+ kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1568,12 +1597,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
static CLOSURE_CALLBACK(journal_write_done)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
- u64 v, seq;
+ u64 v, seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@@ -1593,63 +1622,68 @@ static CLOSURE_CALLBACK(journal_write_done)
if (err)
bch2_fatal_error(c);
- spin_lock(&j->lock);
- seq = le64_to_cpu(w->data->seq);
+ closure_debug_destroy(cl);
+ spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+ w->write_done = true;
+
+ bool completed = false;
- if (!err) {
- if (!JSET_NO_FLUSH(w->data)) {
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ w = j->buf + (seq & JOURNAL_BUF_MASK);
+ if (!w->write_done)
+ break;
+
+ if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
-
bch2_reset_alloc_cursors(c);
}
- } else if (!j->err_seq || seq < j->err_seq)
- j->err_seq = seq;
- j->seq_ondisk = seq;
+ j->seq_ondisk = seq;
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
- /* also must come before signalling write completion: */
- closure_debug_destroy(cl);
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+ BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
- v = atomic64_read(&j->reservations.counter);
- do {
- old.v = new.v = v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
- new.unwritten_idx++;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ closure_wake_up(&w->wait);
+ completed = true;
+ }
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
+ if (completed) {
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, false);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
- closure_wake_up(&w->wait);
- journal_wake(j);
+ journal_wake(j);
+ }
- if (!journal_state_count(new, new.unwritten_idx) &&
- journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
- spin_unlock(&j->lock);
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
- } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@@ -1659,46 +1693,46 @@ static CLOSURE_CALLBACK(journal_write_done)
* previous entries still in flight - the current journal entry
* might want to be written now:
*/
-
- spin_unlock(&j->lock);
- mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
- } else {
- spin_unlock(&j->lock);
+ mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
{
- struct bch_dev *ca = bio->bi_private;
+ struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+ struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- unsigned long flags;
+ struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
+ unsigned long flags;
+
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
- closure_put(&j->io);
+ closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
static CLOSURE_CALLBACK(do_journal_write)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct journal_device *ja = &ca->journal;
+
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@@ -1708,7 +1742,7 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
- bio = ca->journal.bio;
+ struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1727,11 +1761,10 @@ static CLOSURE_CALLBACK(do_journal_write)
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
- ca->journal.bucket_seq[ca->journal.cur_idx] =
- le64_to_cpu(w->data->seq);
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1782,11 +1815,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
- struct bkey_i *k;
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
- bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
bch2_journal_keys_to_write_buffer_end(c, &wb);
return ret;
}
@@ -1798,15 +1831,24 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
+
+ spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+ struct jset_entry_datetime *d =
+ container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+ d->entry.type = BCH_JSET_ENTRY_datetime;
+ d->seconds = cpu_to_le64(ktime_get_real_seconds());
+
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
- BUG_ON(u64s > j->entry_u64s_reserved);
+
+ WARN_ON(u64s > j->entry_u64s_reserved);
le32_add_cpu(&jset->u64s, u64s);
@@ -1814,7 +1856,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
bytes = vstruct_bytes(jset);
if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
return -EINVAL;
@@ -1842,8 +1884,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret))
+ if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1893,6 +1934,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
j->nr_noflush_writes++;
} else {
+ w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
@@ -1903,20 +1945,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
CLOSURE_CALLBACK(bch2_journal_write)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
- struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
+ for_each_rw_member(c, ca)
+ nr_rw_members++;
+
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+ BUG_ON(!w->write_started);
+ BUG_ON(w->write_allocated);
+ BUG_ON(w->write_done);
j->write_start_time = local_clock();
spin_lock(&j->lock);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
@@ -1956,12 +2006,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
* bch2_journal_space_available():
*/
w->sectors = 0;
+ w->write_allocated = true;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1969,12 +2021,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@@ -1985,25 +2031,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ if (!JSET_NO_FLUSH(w->data))
+ closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
+
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
- bio = ca->journal.bio;
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_PREFLUSH);
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
- continue_at(cl, do_journal_write, c->io_complete_wq);
+ continue_at(cl, do_journal_write, j->wq);
return;
no_io:
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
return;
err:
bch2_fatal_error(c);
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
}
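
journal_write_done() now completes buffers strictly in sequence order:
writes may finish out of order, but the walk from
journal_last_unwritten_seq() stops at the first buffer whose write_done
flag is still unset. A standalone sketch of that invariant, using a toy
ring of four in the spirit of JOURNAL_BUF_MASK (not the kernel's types):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct ring {
	bool		write_done[4];
	uint64_t	first_unwritten, cur;
};

/* Advance across the contiguous prefix of completed writes: */
static uint64_t advance(struct ring *r)
{
	uint64_t seq;

	for (seq = r->first_unwritten; seq <= r->cur; seq++) {
		if (!r->write_done[seq & 3])
			break;
		r->write_done[seq & 3] = false;
		r->first_unwritten = seq + 1;
	}
	return r->first_unwritten;
}

int main(void)
{
	struct ring r = { .first_unwritten = 10, .cur = 12 };

	r.write_done[11 & 3] = true;	/* seq 11 done, 10 still in flight */
	assert(advance(&r) == 10);	/* cannot advance past 10 */

	r.write_done[10 & 3] = true;	/* now 10 and 11 form a prefix */
	assert(advance(&r) == 12);
	return 0;
}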
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index c035e7c108e1..4f1e763ab506 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -2,26 +2,35 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
+#include "darray.h"
+
+struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
- struct journal_ptr {
- bool csum_good;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
- } ptrs[BCH_REPLICAS_MAX];
- unsigned nr_ptrs;
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
- bool ignore;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+ return !i || i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
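+
DARRAY_PREALLOCATED(struct journal_ptr, 8) turns the fixed
ptrs[BCH_REPLICAS_MAX] array into a growable vector backed by inline
storage: the common few-replica case allocates nothing, and large
pointer counts are no longer truncated. A generic user-space model of
the pattern (this is not bcachefs's actual darray.h):

#include <stdlib.h>
#include <string.h>

struct prealloc_vec {
	size_t	nr, capacity;
	int	*data;
	int	inlined[8];	/* used until the vector outgrows it */
};

static int vec_push(struct prealloc_vec *p, int v)
{
	if (!p->data) {
		p->data = p->inlined;
		p->capacity = sizeof(p->inlined) / sizeof(p->inlined[0]);
	}

	if (p->nr == p->capacity) {
		size_t new_capacity = p->capacity * 2;
		int *n = malloc(new_capacity * sizeof(*n));

		if (!n)
			return -1;
		memcpy(n, p->data, p->nr * sizeof(*n));
		if (p->data != p->inlined)
			free(p->data);
		p->data = n;
		p->capacity = new_capacity;
	}

	p->data[p->nr++] = v;
	return 0;
}

int main(void)
{
	struct prealloc_vec p = { 0 };

	for (int i = 0; i < 20; i++)	/* grows past the inline 8 */
		if (vec_push(&p, i))
			return 1;
	if (p.data != p.inlined)
		free(p.data);
	return 0;
}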
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
@@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
}
#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
+ for (struct jset_entry *entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define jset_entry_for_each_key(_e, _k) \
- for (_k = (_e)->start; \
+ for (struct bkey_i *_k = (_e)->start; \
_k < vstruct_last(_e); \
_k = bkey_next(_k))
@@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+ memset(entry, 0, u64s * sizeof(u64));
+ /*
+ * The u64s field counts from the start of data, ignoring the shared
+ * fields.
+ */
+ entry->u64s = cpu_to_le16(u64s - 1);
+
+ *end = vstruct_next(*end);
+ return entry;
+}
+
#endif /* _BCACHEFS_JOURNAL_IO_H */
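
jset_entry_init() stores the size in u64s counted past the 8-byte entry
header, which is why the 16-byte datetime entry added by this patch
validates against expected = 16 yet carries u64s == 1 on disk. A quick
standalone check of that arithmetic:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	size_t size = 16;	/* sizeof(struct jset_entry_datetime), per the validate fn */
	unsigned u64s = DIV_ROUND_UP(size, sizeof(uint64_t));

	assert(u64s == 2);	/* total size in u64s, header included */
	assert(u64s - 1 == 1);	/* the value written to entry->u64s */
	return 0;
}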
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c33dca641575..ab811c0dad26 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
- &j->low_on_space_start, low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
- &j->low_on_pin_start, low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
- &j->write_buffer_full_start, low_on_wb))
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
@@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
@@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j,
return;
}
- reclaim = __journal_pin_drop(j, dst);
+ bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+ spin_unlock(&j->lock);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
- reclaim = __journal_pin_drop(j, pin);
+ bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+
+ spin_unlock(&j->lock);
}
/**
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 0200e299cfbb..b5303874fc35 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
-static struct bch_sb_field_journal_seq_blacklist *
-blacklist_entry_try_merge(struct bch_fs *c,
- struct bch_sb_field_journal_seq_blacklist *bl,
- unsigned i)
-{
- unsigned nr = blacklist_nr_entries(bl);
-
- if (le64_to_cpu(bl->start[i].end) >=
- le64_to_cpu(bl->start[i + 1].start)) {
- bl->start[i].end = bl->start[i + 1].end;
- --nr;
- memmove(&bl->start[i],
- &bl->start[i + 1],
- sizeof(bl->start[0]) * (nr - i));
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr));
- BUG_ON(!bl);
- }
-
- return bl;
-}
-
-static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
- u64 start, u64 end)
-{
- return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
-}
-
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i, nr;
+ unsigned i = 0, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
- for (i = 0; i < nr; i++) {
+ while (i < nr) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
- if (bl_entry_contig_or_overlaps(e, start, end)) {
- e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
- e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
-
- if (i + 1 < nr)
- bl = blacklist_entry_try_merge(c,
- bl, i);
- if (i)
- bl = blacklist_entry_try_merge(c,
- bl, i - 1);
- goto out_write_sb;
+ if (end < le64_to_cpu(e->start))
+ break;
+
+ if (start > le64_to_cpu(e->end)) {
+ i++;
+ continue;
}
+
+ /*
+ * Entry is contiguous or overlapping with new entry: merge it
+ * with new entry, and delete:
+ */
+
+ start = min(start, le64_to_cpu(e->start));
+ end = max(end, le64_to_cpu(e->end));
+ array_remove_item(bl->start, nr, i);
}
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
goto out;
}
- bl->start[nr].start = cpu_to_le64(start);
- bl->start[nr].end = cpu_to_le64(end);
-out_write_sb:
+ array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+ .start = cpu_to_le64(start),
+ .end = cpu_to_le64(end),
+ }));
c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
@@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
if (!bl)
return 0;
- t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
- GFP_KERNEL);
+ t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
return -BCH_ERR_ENOMEM_blacklist_table_init;
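
The rewritten bch2_journal_seq_blacklist_add() replaces the pairwise
try-merge helpers with a single pass: every existing range that touches
or overlaps [start, end] is absorbed and deleted, then the widened range
is inserted at the scan position. A standalone sketch of the same loop
over a plain array (not the on-disk superblock field):

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct range { uint64_t start, end; };	/* inclusive, like seq blacklist entries */

static unsigned merge_range(struct range *r, unsigned nr,
			    uint64_t start, uint64_t end)
{
	unsigned i = 0;

	while (i < nr) {
		if (end < r[i].start)
			break;			/* strictly before this entry */
		if (start > r[i].end) {
			i++;			/* strictly after this entry */
			continue;
		}
		/* contiguous or overlapping: absorb, then delete */
		start = start < r[i].start ? start : r[i].start;
		end   = end   > r[i].end   ? end   : r[i].end;
		memmove(&r[i], &r[i + 1], sizeof(r[0]) * (nr - i - 1));
		nr--;
	}

	memmove(&r[i + 1], &r[i], sizeof(r[0]) * (nr - i));
	r[i] = (struct range) { start, end };
	return nr + 1;
}

int main(void)
{
	struct range r[8] = { { 1, 3 }, { 5, 7 }, { 10, 12 } };
	unsigned nr = merge_range(r, 3, 3, 5);	/* bridges [1,3] and [5,7] */

	assert(nr == 2);
	assert(r[0].start == 1 && r[0].end == 7);
	assert(r[1].start == 10 && r[1].end == 12);
	return 0;
}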
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 38817c7a0851..8c053cb64ca5 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -18,6 +18,7 @@
* the journal that are being staged or in flight.
*/
struct journal_buf {
+ struct closure io;
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +34,14 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
- bool noflush; /* write has already been kicked off, and was noflush */
- bool must_flush; /* something wants a flush */
- bool separate_flush;
- bool need_flush_to_write_buffer;
+ bool noflush:1; /* write has already been kicked off, and was noflush */
+ bool must_flush:1; /* something wants a flush */
+ bool separate_flush:1;
+ bool need_flush_to_write_buffer:1;
+ bool write_started:1;
+ bool write_allocated:1;
+ bool write_done:1;
+ u8 idx;
};
/*
@@ -134,6 +139,7 @@ enum journal_flags {
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
x(ok) \
+ x(retry) \
x(blocked) \
x(max_in_flight) \
x(journal_full) \
@@ -149,6 +155,13 @@ enum journal_errors {
typedef DARRAY(u64) darray_u64;
+struct journal_bio {
+ struct bch_dev *ca;
+ unsigned buf_idx;
+
+ struct bio bio;
+};
+
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
@@ -203,8 +216,8 @@ struct journal {
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure io;
struct delayed_work write_work;
+ struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@@ -274,11 +287,6 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
- u64 low_on_space_start;
- u64 low_on_pin_start;
- u64 max_in_flight_start;
- u64 write_buffer_full_start;
-
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
@@ -313,7 +321,7 @@ struct journal_device {
u64 *buckets;
/* Bio for journal reads/writes to this device */
- struct bio *bio;
+ struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;
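
With the bio embedded at the end of struct journal_bio,
journal_write_endio() recovers the owning device and buffer index from
the bio pointer alone via container_of(). A minimal standalone
illustration of the trick (toy struct layout, not the kernel's):

#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct bio { int dummy; };

struct journal_bio {
	int		ca;
	unsigned	buf_idx;
	struct bio	bio;	/* must be last, as in the real struct */
};

int main(void)
{
	struct journal_bio jbio = { .ca = 7, .buf_idx = 3 };
	struct bio *bio = &jbio.bio;
	struct journal_bio *back = container_of(bio, struct journal_bio, bio);

	assert(back->ca == 7 && back->buf_idx == 3);
	return 0;
}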
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index ad598105c587..9fac838d123e 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -101,8 +101,8 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
- __func__, buf.buf, bch2_err_str(ret));
+ bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+ buf.buf, bch2_err_str(ret));
printbuf_exit(&buf);
}
}
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 7a4ca5a28b3e..26569043e368 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
u64 dev_bucket, u64 time, bool set)
{
return time
- ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
: 0;
}
@@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, lru_entry_bad,
+ if (fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index bf0ef668fd38..0ea9f30803a2 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
* @s: mean and variance number of samples and their sums
* @x: new value to include in the &mean_and_variance_weighted
+ * @initted: caller must track whether this is the first use or not
+ * @weight: ewma weight
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 x, bool initted, u8 weight)
{
// previous weighted variance.
- u8 w = s->weight;
+ u8 w = weight;
u64 var_w0 = s->variance;
// new value weighted.
s64 x_w = x << w;
@@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
// new mean weighted.
s64 u_w1 = s->mean + diff;
- if (!s->init) {
+ if (!initted) {
s->mean = x_w;
s->variance = 0;
} else {
s->mean = u_w1;
s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
}
- s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return fast_divpow2(s.mean, s.weight);
+ return fast_divpow2(s.mean, weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight)
{
// always positive don't need fast divpow2
- return s.variance >> s.weight;
+ return s.variance >> weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
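
With the weight now passed per call rather than stored in the struct,
the fixed-point EWMA update is easy to exercise in user space. A sketch
reproducing the values asserted in mean_and_variance_test.c below;
fast_divpow2() is modeled as a plain arithmetic shift, which matches it
for the non-negative intermediates used here:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct mvw { int64_t mean; uint64_t variance; };	/* bitshifted by weight */

static void mvw_update(struct mvw *s, int64_t x, bool initted, uint8_t w)
{
	uint64_t var_w0 = s->variance;		/* previous weighted variance */
	int64_t x_w = x << w;			/* new value, weighted */
	int64_t diff_w = x_w - s->mean;
	int64_t u_w1 = s->mean + (diff_w >> w);	/* new weighted mean */

	if (!initted) {
		s->mean = x_w;
		s->variance = 0;
	} else {
		s->mean = u_w1;
		s->variance = ((var_w0 << w) - var_w0 +
			       ((diff_w * (x_w - u_w1)) >> w)) >> w;
	}
}

int main(void)
{
	struct mvw s = {};

	mvw_update(&s, 10, false, 2);
	assert(s.mean >> 2 == 10 && s.variance >> 2 == 0);
	mvw_update(&s, 20, true, 2);
	assert(s.mean >> 2 == 12 && s.variance >> 2 == 18);
	mvw_update(&s, 30, true, 2);
	assert(s.mean >> 2 == 16 && s.variance >> 2 == 72);
	return 0;
}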
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 64df11ab422b..4fcf062dd22c 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -154,8 +154,6 @@ struct mean_and_variance {
/* expontentially weighted variant */
struct mean_and_variance_weighted {
- bool init;
- u8 weight; /* base 2 logarithim */
s64 mean;
u64 variance;
};
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
u64 mean_and_variance_get_variance(struct mean_and_variance s1);
u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 v, bool initted, u8 weight);
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight);
#endif // MEAN_AND_VAIRANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 019583c3ca0e..db63b3f3b338 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test)
static void mean_and_variance_weighted_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 2 };
+ struct mean_and_variance_weighted s = { };
- mean_and_variance_weighted_update(&s, 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, 10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, 20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, 20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, 30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, 30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
- s = (struct mean_and_variance_weighted) { .weight = 2 };
+ s = (struct mean_and_variance_weighted) { };
- mean_and_variance_weighted_update(&s, -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, -10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, -20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, -20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, -30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, -30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
}
static void mean_and_variance_weighted_advanced_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 8 };
+ struct mean_and_variance_weighted s = { };
+ bool initted = false;
s64 i;
- for (i = 10; i <= 100; i += 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = 10; i <= 100; i += 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
- s = (struct mean_and_variance_weighted) { .weight = 8 };
+ s = (struct mean_and_variance_weighted) { };
+ initted = false;
- for (i = -10; i >= -100; i -= 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = -10; i >= -100; i -= 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
}
static void do_mean_and_variance_test(struct kunit *test,
@@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test,
s64 *weighted_stddev)
{
struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { .weight = weight };
+ struct mean_and_variance_weighted vw = { };
for (unsigned i = 0; i < initial_n; i++) {
mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value);
+ mean_and_variance_weighted_update(&vw, initial_value, false, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
}
for (unsigned i = 0; i < n; i++) {
mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i]);
+ mean_and_variance_weighted_update(&vw, data[i], true, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
}
KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 5623cee3ef86..69098eeb5d48 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -EINVAL;
+ return -BCH_ERR_remove_would_lose_data;
return 0;
}
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -EINVAL;
+ return -BCH_ERR_remove_with_metadata_missing_unimplemented;
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@@ -132,10 +132,8 @@ retry:
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
- if (ret) {
- bch_err(c, "Cannot drop device without losing data");
+ if (ret)
break;
- }
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 69e06a84dad4..0d2b82d8d11f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -155,8 +155,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_err_matches(ret, EROFS))
return ret;
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
- __func__, bch2_err_str(ret)))
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index b1ed0b9a20d3..08ea0cfc4aef 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c,
if (ret < 0 || (*res != 0 && *res != 1)) {
if (err)
prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
+ return ret < 0 ? ret : -BCH_ERR_option_not_bool;
}
break;
case BCH_OPT_UINT:
@@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts = kstrdup(options, GFP_KERNEL);
if (!copied_opts)
- return -1;
+ return -ENOMEM;
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
@@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
bad_opt:
pr_err("Bad mount option %s", name);
- ret = -1;
+ ret = -BCH_ERR_option_name;
goto out;
bad_val:
pr_err("Invalid mount option %s", err.buf);
- ret = -1;
+ ret = -BCH_ERR_option_value;
goto out;
out:
kfree(copied_opts_start);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9a4b7faa3765..136083c11f3a 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -290,6 +290,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
+ x(no_splitbrain_check, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't kick drives out when splitbrain detected")\
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
@@ -332,6 +337,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
+ x(fsck_memory_usage_percent, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(20, 70), \
+ BCH2_NO_SB_OPT, 50, \
+ NULL, "Maximum percentage of system ram fsck is allowed to pin")\
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_FN(bch2_opt_fix_errors), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 22d1017aa49b..56336f3dd1d0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
prt_str(out, "io wait remaining: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
prt_str(out, "duration waited: ");
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 21e13bb4335b..03f9d6afe467 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -52,15 +52,50 @@ static bool btree_id_is_alloc(enum btree_id id)
}
/* for -o reconstruct_alloc: */
-static void drop_alloc_keys(struct journal_keys *keys)
+static void do_reconstruct_alloc(struct bch_fs *c)
{
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
+ struct journal_keys *keys = &c->journal_keys;
size_t src, dst;
- for (src = 0, dst = 0; src < keys->nr; src++)
- if (!btree_id_is_alloc(keys->d[src].btree_id))
- keys->d[dst++] = keys->d[src];
+ move_gap(keys, keys->nr);
- keys->nr = dst;
+ for (src = 0, dst = 0; src < keys->nr; src++)
+ if (!btree_id_is_alloc(keys->data[src].btree_id))
+ keys->data[dst++] = keys->data[src];
+ keys->nr = keys->gap = dst;
}
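do_reconstruct_alloc() records the required recovery passes and silenced errors in little-endian superblock bitmaps via __set_bit_le64(), so the on-disk layout is the same regardless of host endianness. A standalone stand-in for such a helper (hypothetical and simplified, not the kernel implementation):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t cpu_to_le64_sketch(uint64_t v)
    {
        return v;  /* no-op on little-endian; a real helper byte-swaps on BE */
    }

    static void set_bit_le64_sketch(unsigned nr, uint64_t *bitmap)
    {
        bitmap[nr / 64] |= cpu_to_le64_sketch(1ULL << (nr % 64));
    }

    int main(void)
    {
        uint64_t errors_silent[5] = {};

        set_bit_le64_sketch(245, errors_silent);   /* e.g. an error table ID */
        printf("word 3 = 0x%llx\n", (unsigned long long)errors_silent[3]);
        return 0;
    }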
/*
@@ -70,9 +105,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}
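journal_keys is now kept as a gap buffer: live entries sit in [0, gap) and [gap + gapsize, size), and move_gap(keys, keys->nr) shifts the gap to the end so the entries become one contiguous sorted array before they are iterated. A standalone sketch of the idea (simplified, fixed capacity, illustrative only):

    #include <stdio.h>
    #include <string.h>

    struct gap_buf {
        int    data[16];
        size_t nr;      /* live entries */
        size_t gap;     /* index where the gap starts */
        size_t size;    /* total capacity */
    };

    static void move_gap_sketch(struct gap_buf *b, size_t new_gap)
    {
        size_t gapsize = b->size - b->nr;

        if (new_gap < b->gap)
            memmove(b->data + new_gap + gapsize, b->data + new_gap,
                    (b->gap - new_gap) * sizeof(b->data[0]));
        else if (new_gap > b->gap)
            memmove(b->data + b->gap, b->data + b->gap + gapsize,
                    (new_gap - b->gap) * sizeof(b->data[0]));
        b->gap = new_gap;
    }

    int main(void)
    {
        struct gap_buf b = { .data = { 1, 2, [14] = 3, 4 },
                             .nr = 4, .gap = 2, .size = 16 };

        move_gap_sketch(&b, b.nr);      /* like move_gap(keys, keys->nr) */
        for (size_t i = 0; i < b.nr; i++)
            printf("%d ", b.data[i]);   /* 1 2 3 4 */
        printf("\n");
        return 0;
    }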
@@ -124,6 +157,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (ret)
goto out;
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ if (unlikely(!btree_path_node(path, k->level))) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, 0, iter_flags);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_increase_depth(trans, iter.path, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
/* Must be checked with btree locked: */
if (k->overwritten)
goto out;
@@ -161,16 +205,16 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
+ move_gap(keys, keys->nr);
+
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
* that would cause a journal deadlock.
*/
- for (size_t i = 0; i < keys->nr; i++) {
+ darray_for_each(*keys, k) {
cond_resched();
- struct journal_key *k = keys->d + i;
-
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
commit_do(trans, NULL, NULL,
@@ -264,7 +308,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
- r->error = -EIO;
+ r->error = -BCH_ERR_btree_node_read_error;
}
r->alive = true;
break;
@@ -359,7 +403,7 @@ static int journal_replay_early(struct bch_fs *c,
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
vstruct_for_each(&i->j, entry) {
@@ -388,11 +432,8 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) &&
- c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
- }
if (r->error) {
__fsck_err(c,
@@ -524,8 +565,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
* setting journal_key->overwritten: it will be accessed by multiple
* threads
*/
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
+ move_gap(keys, keys->nr);
set_bit(BCH_FS_may_go_rw, &c->flags);
@@ -862,7 +902,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto out;
genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i && !(*i)->ignore) {
+ if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
@@ -887,7 +927,8 @@ int bch2_fs_recovery(struct bch_fs *c)
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i) {
last_journal_entry = &(*i)->j;
- (*i)->ignore = false;
+ (*i)->ignore_blacklisted = false;
+ (*i)->ignore_not_dirty= false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
@@ -923,10 +964,8 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- drop_alloc_keys(&c->journal_keys);
- }
+ if (c->opts.reconstruct_alloc)
+ do_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@@ -950,7 +989,7 @@ use_clean:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
- bch_err(c, "error creating new journal seq blacklist entry");
+ bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
}
}
@@ -961,9 +1000,6 @@ use_clean:
if (ret)
goto err;
- if (c->opts.reconstruct_alloc)
- bch2_journal_log_msg(c, "dropping alloc info");
-
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index fa0c8efd2a1b..4959e95e7c74 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -13,11 +13,11 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(check_topology, 4, 0) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_topology, 4, 0) \
x(check_allocations, 5, PASS_FSCK) \
x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
@@ -34,6 +34,7 @@
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
@@ -43,6 +44,7 @@
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
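Note how check_topology moves to the front of the list while keeping stable number 4, and the new passes take fresh numbers 35 and 36 even though they sit mid-list: list position is execution order, while the second x() argument is the stable identifier persisted in the superblock. A standalone sketch of that decoupling (invented names):

    #include <stdio.h>

    #define PASSES()                      \
        x(check_topology,          4)     \
        x(alloc_read,              0)     \
        x(check_subvols,           20)    \
        x(check_subvol_children,   35)

    #define x(n, id) Pass_##n,
    enum pass { PASSES() Pass_NR };       /* execution order 0, 1, 2, ... */
    #undef x

    #define x(n, id) [Pass_##n] = id,
    static const int pass_stable_id[] = { PASSES() };   /* on-disk IDs */
    #undef x

    int main(void)
    {
        printf("check_topology runs at position %d, stable id %d\n",
               Pass_check_topology, pass_stable_id[Pass_check_topology]);
        return 0;
    }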
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index b6bf0ebe7e84..5980ba2563fe 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -171,22 +171,6 @@ fsck_err:
return ERR_PTR(ret);
}
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 441dcb1bf160..e4396cb0bacb 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -45,7 +45,13 @@
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
+ x(subvolume_fs_parent, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
+ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
+ x(btree_subvolume_children, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
+ BCH_FSCK_ERR_subvol_children_not_set)
#define DOWNGRADE_TABLE()
@@ -253,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
if (e < BCH_SB_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
- ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+ __set_bit_le64(e, ext->errors_silent);
}
}
}
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index c08aacdfd073..5178bf579f7c 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -231,7 +231,7 @@
x(dirent_name_dot_or_dotdot, 223) \
x(dirent_name_has_slash, 224) \
x(dirent_d_type_wrong, 225) \
- x(dirent_d_parent_subvol_wrong, 226) \
+ x(inode_bi_parent_wrong, 226) \
x(dirent_in_missing_dir_inode, 227) \
x(dirent_in_non_dir_inode, 228) \
x(dirent_to_missing_inode, 229) \
@@ -250,7 +250,22 @@
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245)
+ x(reflink_p_front_pad_bad, 245) \
+ x(journal_entry_dup_same_device, 246) \
+ x(inode_bi_subvol_missing, 247) \
+ x(inode_bi_subvol_wrong, 248) \
+ x(inode_points_to_missing_dirent, 249) \
+ x(inode_points_to_wrong_dirent, 250) \
+ x(inode_bi_parent_nonzero, 251) \
+ x(dirent_to_missing_parent_subvol, 252) \
+ x(dirent_not_visible_in_parent_subvol, 253) \
+ x(subvol_fs_path_parent_wrong, 254) \
+ x(subvol_root_fs_path_parent_nonzero, 255) \
+ x(subvol_children_not_set, 256) \
+ x(subvol_children_bad, 257) \
+ x(subvol_loop, 258) \
+ x(subvol_unreachable, 259) \
+ x(btree_node_bkey_bad_u64s, 260)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ac6ba04d5521..39debe814bf3 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -91,18 +91,20 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
/* Snapshot nodes: */
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
-
- rcu_read_lock();
- t = rcu_dereference(c->snapshots);
-
while (id && id < ancestor)
id = __snapshot_t(t, id)->parent;
+ return id == ancestor;
+}
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ rcu_read_lock();
+ bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
rcu_read_unlock();
- return id == ancestor;
+ return ret;
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -120,13 +122,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
bool ret;
- EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
+ struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+ if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) {
+ ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+ goto out;
+ }
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
@@ -134,11 +138,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
if (id && id < ancestor) {
ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
- EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
} else {
ret = id == ancestor;
}
-
+out:
rcu_read_unlock();
return ret;
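The refactor above lets the fast path fall back to the slow walk under the same RCU read lock when snapshot info isn't fully read in yet. The two paths it reconciles are a parent-pointer walk and a per-node bitmap covering the 128 IDs above a node, which is what test_bit(ancestor - id - 1, ...) indexes. A standalone sketch of both (simplified, illustrative):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IS_ANCESTOR_BITMAP 128

    struct snap {
        uint32_t parent;    /* 0 = root; parent ids are always higher */
        uint64_t is_ancestor[IS_ANCESTOR_BITMAP / 64];
    };

    /* slow path: walk parent pointers until we reach or pass the ancestor */
    static bool is_ancestor_early(const struct snap *t, uint32_t id, uint32_t anc)
    {
        while (id && id < anc)
            id = t[id].parent;
        return id == anc;
    }

    /* fast path: each node caches a bitmap of ancestors up to 128 ids above */
    static bool is_ancestor_bitmap(const struct snap *t, uint32_t id, uint32_t anc)
    {
        uint32_t bit = anc - id - 1;

        return t[id].is_ancestor[bit / 64] & (1ULL << (bit % 64));
    }

    int main(void)
    {
        struct snap t[8] = {};
        uint32_t id = 1;

        t[1].parent = 2;    /* chain: 1 -> 2 -> 4 -> 7 */
        t[2].parent = 4;
        t[4].parent = 7;

        /* precompute node 1's ancestor bitmap from the slow walk */
        for (uint32_t a = t[id].parent; a; a = t[a].parent) {
            uint32_t bit = a - id - 1;
            t[id].is_ancestor[bit / 64] |= 1ULL << (bit % 64);
        }

        printf("%d %d\n", is_ancestor_early(t, id, 7),
                          is_ancestor_bitmap(t, id, 7));    /* 1 1 */
        return 0;
    }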
@@ -547,7 +551,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+ fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
c, snapshot_tree_to_wrong_subvol,
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index fcaa5a888744..3976f80721bf 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
@@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
insert->k.p.inode = inum.inum;
- return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, str_hash_flags, 0);
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_set_in_snapshot(trans, desc, info, inum,
+ snapshot, insert, str_hash_flags, 0);
}
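The rewritten bch2_hash_set() chains its two steps with the GNU a ?: b operator, a pattern used throughout bcachefs for error-code plumbing: the first nonzero (error) value is returned and later steps are skipped. A minimal standalone sketch (requires GCC or Clang):

    #include <stdio.h>

    static int step1(void) { printf("step1\n"); return 0; }
    static int step2(void) { printf("step2\n"); return -22; }
    static int step3(void) { printf("step3\n"); return 0; }

    int main(void)
    {
        int ret = step1() ?: step2() ?: step3();    /* step3 never runs */

        printf("ret = %d\n", ret);
        return 0;
    }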
static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 7c67c28d3ef8..ce7aed121942 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -13,13 +13,26 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
+static struct bpos subvolume_children_pos(struct bkey_s_c k)
+{
+ if (k.k->type != KEY_TYPE_subvolume)
+ return POS_MIN;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (!s.v->fs_path_parent)
+ return POS_MIN;
+ return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
+}
+
static int check_subvol(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_subvolume subvol;
+ struct btree_iter subvol_children_iter = {};
struct bch_snapshot snapshot;
+ struct printbuf buf = PRINTBUF;
unsigned snapid;
int ret = 0;
@@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans,
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+ if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+ subvol.v->fs_path_parent,
+ c, subvol_root_fs_path_parent_nonzero,
+ "root subvolume has nonzero fs_path_parent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.fs_path_parent = 0;
+ }
+
+ if (subvol.v->fs_path_parent) {
+ struct bpos pos = subvolume_children_pos(k);
+
+ struct bkey_s_c subvol_children_k =
+ bch2_bkey_get_iter(trans, &subvol_children_iter,
+ BTREE_ID_subvolume_children, pos, 0);
+ ret = bkey_err(subvol_children_k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
+ c, subvol_children_not_set,
+ "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
+ pos.inode, pos.offset,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+ if (ret)
+ goto err;
+ }
+ }
+
+ struct bch_inode_unpacked inode;
+ struct btree_iter inode_iter = {};
+ ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+ (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+ 0);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ c, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode_iter.k.p.snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ if (ret)
+ goto err;
+ }
+
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree;
@@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans,
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
}
-
+err:
fsck_err:
+ bch2_trans_iter_exit(trans, &subvol_children_iter);
+ printbuf_exit(&buf);
return ret;
}
@@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c)
return ret;
}
+static int check_subvol_child(struct btree_trans *trans,
+ struct btree_iter *child_iter,
+ struct bkey_s_c child_k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_subvolume s;
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
+ 0, subvolume, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret ||
+ le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
+ c, subvol_children_bad,
+ "incorrect entry in subvolume_children btree %llu:%llu",
+ child_k.k->p.inode, child_k.k->p.offset)) {
+ ret = bch2_btree_delete_at(trans, child_iter, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ return ret;
+}
+
+int bch2_check_subvol_children(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_child(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return 0;
+}
+
/* Subvolumes: */
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
- prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
+ prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
+ prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
+ }
+}
+
+static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
+{
+ return !bpos_eq(pos, POS_MIN)
+ ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
+ : 0;
+}
+
+int bch2_subvolume_trigger(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bpos children_pos_old = subvolume_children_pos(old);
+ struct bpos children_pos_new = subvolume_children_pos(new.s_c);
+
+ if (!bpos_eq(children_pos_old, children_pos_new)) {
+ int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
+ subvolume_children_mod(trans, children_pos_new, true);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
+{
+ struct btree_iter iter;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return bkey_err(k) ?: k.k && k.k->p.inode == subvol
+ ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+ : 0;
}
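Together, subvolume_children_pos(), the transactional trigger, and bch2_subvol_has_children() maintain and query a reverse index: every subvolume with a nonzero fs_path_parent owns a key at (parent, child) in the subvolume_children btree, so an emptiness check is a single ordered lookup. A standalone sketch of the pattern (a flat set stands in for the btree; illustrative only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct pos { uint64_t inode, offset; };     /* (parent, child) */

    #define MAX_CHILD_KEYS 64
    static struct pos child_keys[MAX_CHILD_KEYS];
    static size_t nr_child_keys;

    /* trigger: on any update, drop the old reverse-index entry, add the new */
    static void children_mod(struct pos p, bool set)
    {
        if (!p.inode && !p.offset)      /* POS_MIN: no parent */
            return;
        for (size_t i = 0; i < nr_child_keys; i++)
            if (child_keys[i].inode == p.inode &&
                child_keys[i].offset == p.offset) {
                if (!set)
                    child_keys[i] = child_keys[--nr_child_keys];
                return;
            }
        if (set)
            child_keys[nr_child_keys++] = p;
    }

    static bool subvol_has_children(uint64_t subvol)
    {
        for (size_t i = 0; i < nr_child_keys; i++)
            if (child_keys[i].inode == subvol)
                return true;
        return false;
    }

    int main(void)
    {
        children_mod((struct pos){ 1, 5 }, true);   /* subvol 5 under 1 */
        printf("%d\n", subvol_has_children(1));     /* 1 */
        children_mod((struct pos){ 1, 5 }, false);
        printf("%d\n", subvol_has_children(1));     /* 0 */
        return 0;
    }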
static __always_inline int
@@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
return 0;
s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
@@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (ret)
return ret;
- s->v.parent = cpu_to_le32(new_parent);
+ s->v.creation_parent = cpu_to_le32(new_parent);
return 0;
}
@@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.parent)));
+ subvolid_to_delete, le32_to_cpu(s.creation_parent)));
}
/*
@@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
}
int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 parent_subvolid,
u32 src_subvolid,
u32 *new_subvolid,
u32 *new_snapshotid,
@@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
if (ret)
goto err;
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.parent = cpu_to_le32(src_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a6f56f66e27c..903c05162c06 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -8,17 +8,22 @@
enum bkey_invalid_flags;
int bch2_check_subvols(struct bch_fs *);
+int bch2_check_subvol_children(struct bch_fs *);
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
.val_to_text = bch2_subvolume_to_text, \
+ .trigger = bch2_subvolume_trigger, \
.min_val_size = 16, \
})
+int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
@@ -30,8 +35,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32,
- u32 *, u32 *, bool);
+int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
int bch2_fs_subvolumes_init(struct bch_fs *);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
index af79134b07d6..e029df7ba89f 100644
--- a/fs/bcachefs/subvolume_format.h
+++ b/fs/bcachefs/subvolume_format.h
@@ -19,8 +19,8 @@ struct bch_subvolume {
* This is _not_ necessarily the subvolume of the directory containing
* this subvolume:
*/
- __le32 parent;
- __le32 pad;
+ __le32 creation_parent;
+ __le32 fs_path_parent;
bch_le128 otime;
};
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index bd64eb68e84a..ad28e370b640 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return ret;
}
+ if (rw == WRITE &&
+ bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+ prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+ le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+ le64_to_cpu(sb->seq));
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
+
return 0;
}
@@ -717,6 +725,7 @@ retry:
if (IS_ERR(sb->s_bdev_file)) {
ret = PTR_ERR(sb->s_bdev_file);
+ prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
goto err;
}
sb->bdev = file_bdev(sb->s_bdev_file);
@@ -743,9 +752,9 @@ retry:
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- printk(KERN_INFO "%s", err2.buf);
+ bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
else
- printk(KERN_ERR "%s", err2.buf);
+ bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
@@ -803,21 +812,20 @@ got_super:
goto err;
}
- ret = 0;
sb->have_layout = true;
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
- printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
goto err_no_print;
}
out:
printbuf_exit(&err);
return ret;
err:
- printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
@@ -977,7 +985,7 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, " > ");
bch2_version_to_text(&buf, bcachefs_metadata_version_current);
prt_str(&buf, ")");
- bch2_fs_fatal_error(c, "%s", buf.buf);
+ bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_sb_not_downgraded;
}
@@ -997,7 +1005,7 @@ int bch2_write_super(struct bch_fs *c)
if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock write was silently dropped! (seq %llu expected %llu)",
+ ": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
@@ -1007,7 +1015,7 @@ int bch2_write_super(struct bch_fs *c)
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock modified by another process (seq %llu expected %llu)",
+ ": Superblock modified by another process (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
@@ -1058,7 +1066,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
- "Unable to write superblock to sufficient devices (from %ps)",
+ ": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
ret = -1;
out:
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 6b23e11825e6..1ad6e5cd9476 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -56,6 +56,7 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "thread_with_file.h"
#include "trace.h"
#include <linux/backing-dev.h>
@@ -86,26 +87,38 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+#ifdef __KERNEL__
+ if (unlikely(stdio)) {
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+ return;
+ }
+#endif
+ vprintk(fmt, args);
+}
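bch2_print_maybe_redirect() strips the two-byte printk level prefix (KERN_SOH plus a level digit) before handing the format string to a userspace-visible buffer, where the prefix would be noise. A standalone sketch of the stripping (illustrative):

    #include <stdio.h>

    #define KERN_SOH "\001"

    int main(void)
    {
        const char *fmt = "\001" "6" "hello %s\n";  /* KERN_INFO "hello %s\n" */

        if (fmt[0] == KERN_SOH[0])
            fmt += 2;               /* skip SOH and the level digit */
        printf(fmt, "world");
        return 0;
    }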
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
va_list args;
va_start(args, fmt);
- if (likely(!stdio)) {
- vprintk(fmt, args);
- } else {
- unsigned long flags;
-
- if (fmt[0] == KERN_SOH[0])
- fmt += 2;
+ bch2_print_maybe_redirect(stdio, fmt, args);
+ va_end(args);
+}
- spin_lock_irqsave(&stdio->output_lock, flags);
- prt_vprintf(&stdio->output_buf, fmt, args);
- spin_unlock_irqrestore(&stdio->output_lock, flags);
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
- wake_up(&stdio->output_wait);
- }
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
va_end(args);
}
@@ -576,7 +589,7 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
- kvpfree(c, sizeof(*c));
+ kvfree(c);
module_put(THIS_MODULE);
}
@@ -715,7 +728,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
unsigned i, iter_size;
int ret = 0;
- c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
@@ -818,13 +831,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
pr_uuid(&name, c->sb.user_uuid.b);
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
goto err;
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -862,13 +875,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -882,8 +895,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
+ mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -1061,7 +1074,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
}
static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb)
+ struct bch_sb_handle *sb,
+ struct bch_opts *opts)
{
if (fs == sb)
return 0;
@@ -1102,11 +1116,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
prt_newline(&buf);
- prt_printf(&buf, "Not using older sb");
+ if (!opts->no_splitbrain_check)
+ prt_printf(&buf, "Not using older sb");
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@@ -1124,17 +1141,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, "believes seq of ");
+ prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
+
+ if (!opts->no_splitbrain_check) {
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+ }
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
return 0;
@@ -1168,8 +1190,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
- bch2_time_stats_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_exit(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@@ -1260,8 +1282,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
- bch2_time_stats_init(&ca->io_latency[READ]);
- bch2_time_stats_init(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
@@ -1597,27 +1619,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "bch2_dev_data_drop()");
if (ret)
goto err;
ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
goto err;
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
goto err;
ret = bch2_journal_flush(&c->journal);
- bch_err(ca, "journal error");
+ bch_err_msg(ca, ret, "bch2_journal_flush()");
if (ret)
goto err;
ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "bch2_replicas_gc2()");
if (ret)
goto err;
@@ -1835,7 +1857,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
@@ -2023,7 +2045,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb);
+ ret = bch2_dev_in_fs(best, sb, &opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index cee80c47feea..c86a93a8d8fc 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -930,10 +930,10 @@ SHOW(bch2_dev)
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+ bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 9220d7de10db..940db15d6a93 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -2,7 +2,6 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
-#include "printbuf.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
@@ -10,6 +9,7 @@
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
+#include <linux/sched/sysctl.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
@@ -65,68 +65,82 @@ err:
return ret;
}
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+/* stdio_redirect */
+
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
- return thr->stdio.output_buf.pos ||
- thr->output2.nr ||
- thr->thr.done;
+ return stdio->input.buf.nr || stdio->done;
}
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- size_t copied = 0, b;
- int ret = 0;
+ return stdio->output.buf.nr || stdio->done;
+}
- if ((file->f_flags & O_NONBLOCK) &&
- !thread_with_stdio_has_output(thr))
- return -EAGAIN;
+#define STDIO_REDIRECT_BUFSIZE 4096
- ret = wait_event_interruptible(thr->stdio.output_wait,
- thread_with_stdio_has_output(thr));
- if (ret)
- return ret;
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+ return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- if (thr->thr.done)
- return 0;
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- while (len) {
- ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
- if (ret)
- break;
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+ spin_lock_init(&buf->lock);
+ init_waitqueue_head(&buf->wait);
+ darray_init(&buf->buf);
+}
- spin_lock_irq(&thr->stdio.output_lock);
- b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+/* thread_with_stdio */
- memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
- memmove(thr->stdio.output_buf.buf,
- thr->stdio.output_buf.buf + b,
- thr->stdio.output_buf.pos - b);
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input.wait);
+ wake_up(&thr->stdio.output.wait);
+}
- thr->output2.nr += b;
- thr->stdio.output_buf.pos -= b;
- spin_unlock_irq(&thr->stdio.output_lock);
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct stdio_buf *buf = &thr->stdio.output;
+ size_t copied = 0, b;
+ int ret = 0;
- b = min(len, thr->output2.nr);
- if (!b)
- break;
+ if (!(file->f_flags & O_NONBLOCK)) {
+ ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+ if (ret)
+ return ret;
+ } else if (!stdio_redirect_has_output(&thr->stdio))
+ return -EAGAIN;
- b -= copy_to_user(buf, thr->output2.data, b);
- if (!b) {
+ while (len && buf->buf.nr) {
+ if (fault_in_writeable(ubuf, len) == len) {
ret = -EFAULT;
break;
}
- copied += b;
- buf += b;
- len -= b;
-
- memmove(thr->output2.data,
- thr->output2.data + b,
- thr->output2.nr - b);
- thr->output2.nr -= b;
+ spin_lock_irq(&buf->lock);
+ b = min_t(size_t, len, buf->buf.nr);
+
+ if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->buf.nr -= b;
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ }
+ spin_unlock_irq(&buf->lock);
}
return copied ?: ret;
@@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file)
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
+ thread_with_stdio_done(thr);
bch2_thread_with_file_exit(&thr->thr);
- printbuf_exit(&thr->stdio.input_buf);
- printbuf_exit(&thr->stdio.output_buf);
- darray_exit(&thr->output2);
- thr->exit(thr);
+ darray_exit(&thr->stdio.input.buf);
+ darray_exit(&thr->stdio.output.buf);
+ thr->ops->exit(thr);
return 0;
}
-#define WRITE_BUFFER 4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
- return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- struct printbuf *buf = &thr->stdio.input_buf;
+ struct stdio_buf *buf = &thr->stdio.input;
size_t copied = 0;
ssize_t ret = 0;
@@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
break;
}
- spin_lock(&thr->stdio.input_lock);
- if (buf->pos < WRITE_BUFFER)
- bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
- b = min(len, printbuf_remaining_size(buf));
-
- if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->pos += b;
+ spin_lock(&buf->lock);
+ if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
+ darray_make_room_gfp(&buf->buf,
+ min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
+ b = min(len, darray_room(buf->buf));
+
+ if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+ buf->buf.nr += b;
+ ubuf += b;
+ len -= b;
+ copied += b;
}
- spin_unlock(&thr->stdio.input_lock);
+ spin_unlock(&buf->lock);
if (b) {
- wake_up(&thr->stdio.input_wait);
+ wake_up(&buf->wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
- ret = wait_event_interruptible(thr->stdio.input_wait,
- thread_with_stdio_has_input_space(thr));
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_input_space(&thr->stdio));
if (ret)
break;
}
@@ -209,90 +217,233 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- poll_wait(file, &thr->stdio.output_wait, wait);
- poll_wait(file, &thr->stdio.input_wait, wait);
+ poll_wait(file, &thr->stdio.output.wait, wait);
+ poll_wait(file, &thr->stdio.input.wait, wait);
__poll_t mask = 0;
- if (thread_with_stdio_has_output(thr))
+ if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
- if (thread_with_stdio_has_input_space(thr))
+ if (stdio_redirect_has_input_space(&thr->stdio))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output.wait, wait);
+
+ __poll_t mask = 0;
+
+ if (stdio_redirect_has_output(&thr->stdio))
+ mask |= EPOLLIN;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ if (thr->ops->unlocked_ioctl)
+ return thr->ops->unlocked_ioctl(thr, cmd, p);
+ return -ENOTTY;
+}
+
static const struct file_operations thread_with_stdio_fops = {
- .release = thread_with_stdio_release,
+ .llseek = no_llseek,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
.llseek = no_llseek,
+ .read = thread_with_stdio_read,
+ .poll = thread_with_stdout_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
};
+static int thread_with_stdio_fn(void *arg)
+{
+ struct thread_with_stdio *thr = arg;
+
+ thr->thr.ret = thr->ops->fn(thr);
+
+ thread_with_stdio_done(thr);
+ return 0;
+}
+
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *))
+ const struct thread_with_stdio_ops *ops)
{
- thr->stdio.input_buf = PRINTBUF;
- thr->stdio.input_buf.atomic++;
- spin_lock_init(&thr->stdio.input_lock);
- init_waitqueue_head(&thr->stdio.input_wait);
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
- thr->stdio.output_buf = PRINTBUF;
- thr->stdio.output_buf.atomic++;
- spin_lock_init(&thr->stdio.output_lock);
- init_waitqueue_head(&thr->stdio.output_wait);
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
- darray_init(&thr->output2);
- thr->exit = exit;
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
}
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
+ struct stdio_buf *buf = &stdio->input;
+
+ /*
+ * we're waiting on user input (or for the file descriptor to be
+ * closed), so we don't want a hung task warning:
+ */
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
+ spin_lock(&buf->lock);
+ int ret = min(len, buf->buf.nr);
+ buf->buf.nr -= ret;
+ memcpy(ubuf, buf->buf.data, ret);
+ memmove(buf->buf.data,
+ buf->buf.data + ret,
+ buf->buf.nr);
+ spin_unlock(&buf->lock);
- wake_up(&stdio->input_wait);
+ wake_up(&buf->wait);
return ret;
}
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
-
- if (stdio->done)
- return -1;
+ struct stdio_buf *buf = &stdio->input;
+ size_t copied = 0;
+ ssize_t ret = 0;
+again:
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
+
+ if (stdio->done) {
+ ret = -1;
+ goto out;
+ }
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- char *n = memchr(stdio->input_buf.buf, '\n', ret);
+ spin_lock(&buf->lock);
+ size_t b = min(len, buf->buf.nr);
+ char *n = memchr(buf->buf.data, '\n', b);
if (n)
- ret = min(ret, n + 1 - stdio->input_buf.buf);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
-
- wake_up(&stdio->input_wait);
+ b = min_t(size_t, b, n + 1 - buf->buf.data);
+ buf->buf.nr -= b;
+ memcpy(ubuf, buf->buf.data, b);
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ ubuf += b;
+ len -= b;
+ copied += b;
+ spin_unlock(&buf->lock);
+
+ wake_up(&buf->wait);
+
+ if (!n && len)
+ goto again;
+out:
+ return copied ?: ret;
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+ ssize_t ret;
+
+ do {
+ va_list args2;
+ size_t len;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+ va_end(args2);
+
+ if (len + 1 <= darray_room(*out)) {
+ out->nr += len;
+ return len;
+ }
+
+ ret = darray_make_room_gfp(out, len + 1, gfp);
+ } while (ret == 0);
+
+ return ret;
+}
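bch2_darray_vprintf() relies on vsnprintf() reporting the length it would have written: if the result didn't fit, the buffer is grown to len + 1 and the format is retried (va_copy keeps the argument list reusable). A standalone userspace sketch of the same loop (illustrative; malloc/realloc standing in for a darray):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int buf_printf(char **buf, size_t *cap, size_t *nr, const char *fmt, ...)
    {
        for (;;) {
            va_list args;
            size_t len;
            char *n;

            va_start(args, fmt);
            len = vsnprintf(*buf + *nr, *cap - *nr, fmt, args);
            va_end(args);

            if (len + 1 <= *cap - *nr) {    /* it fit: commit and return */
                *nr += len;
                return len;
            }

            n = realloc(*buf, *nr + len + 1);   /* grow, then retry */
            if (!n)
                return -1;
            *buf = n;
            *cap = *nr + len + 1;
        }
    }

    int main(void)
    {
        size_t cap = 4, nr = 0;
        char *buf = malloc(cap);

        buf_printf(&buf, &cap, &nr, "hello %s", "world");
        printf("%.*s\n", (int)nr, buf);
        free(buf);
        return 0;
    }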
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, va_list args)
+{
+ struct stdio_buf *buf = &stdio->output;
+ unsigned long flags;
+ ssize_t ret;
+
+again:
+ spin_lock_irqsave(&buf->lock, flags);
+ ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+ spin_unlock_irqrestore(&buf->lock, flags);
+
+ if (ret < 0) {
+ if (nonblocking)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_output_space(stdio));
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ wake_up(&buf->wait);
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, ...)
+{
+ va_list args;
+ ssize_t ret;
+
+ va_start(args, fmt);
+ ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+ va_end(args);
+
return ret;
}
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index 05879c5048c8..af54ea8f5b0f 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -4,6 +4,38 @@
#include "thread_with_file_types.h"
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ * - kthread shutdown behaves like writing or reading from a pipe that has been
+ * closed
+ * - Input and output buffers are 4096 bytes, although buffers may in some
+ * situations slightly exceed that limit so as to avoid chopping off a
+ * message in the middle in nonblocking mode.
+ * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ * should be fine but might change in future revisions.
+ * - Output buffer may grow past 4096 bytes to deal with messages that are
+ * bigger than 4096 bytes
+ * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ * drop entire messages.
+ *
+ * To write, use stdio_redirect_printf()
+ * To read, use stdio_redirect_read() or stdio_redirect_readline()
+ */
+
struct task_struct;
struct thread_with_file {
@@ -17,25 +49,28 @@ int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
+struct thread_with_stdio;
+
+struct thread_with_stdio_ops {
+ void (*exit)(struct thread_with_stdio *);
+ int (*fn)(struct thread_with_stdio *);
+ long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
- DARRAY(char) output2;
- void (*exit)(struct thread_with_stdio *);
+ const struct thread_with_stdio_ops *ops;
};
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input_wait);
- wake_up(&thr->stdio.output_wait);
-}
-
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *));
+ const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
+
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
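A hypothetical usage sketch of the API above (not from the patch; my_job and its fields are invented, and error handling is elided): embed thread_with_stdio, fill in an ops table, and start the kthread; the returned fd's read/write/close drive the thread.

    struct my_job {
        struct thread_with_stdio thr;
        int arg;                        /* invented per-job state */
    };

    static void my_job_exit(struct thread_with_stdio *thr)
    {
        kfree(container_of(thr, struct my_job, thr));
    }

    static int my_job_fn(struct thread_with_stdio *thr)
    {
        struct my_job *job = container_of(thr, struct my_job, thr);

        /* blocking write into the fd's output buffer */
        bch2_stdio_redirect_printf(&thr->stdio, false, "arg = %i\n", job->arg);
        return 0;
    }

    static const struct thread_with_stdio_ops my_job_ops = {
        .exit   = my_job_exit,
        .fn     = my_job_fn,
    };

    static int my_job_start(int arg)
    {
        struct my_job *job = kzalloc(sizeof(*job), GFP_KERNEL);

        if (!job)
            return -ENOMEM;
        job->arg = arg;
        /* on success returns an fd; closing it stops and frees the thread */
        return bch2_run_thread_with_stdio(&job->thr, &my_job_ops);
    }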
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
index 90b5e645e98c..e0daf4eec341 100644
--- a/fs/bcachefs/thread_with_file_types.h
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -2,14 +2,21 @@
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#include "darray.h"
+
+struct stdio_buf {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ darray_char buf;
+};
+
struct stdio_redirect {
- spinlock_t output_lock;
- wait_queue_head_t output_wait;
- struct printbuf output_buf;
+ struct stdio_buf input;
+ struct stdio_buf output;
spinlock_t input_lock;
wait_queue_head_t input_wait;
- struct printbuf input_buf;
+ darray_char input_buf;
bool done;
};
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
new file mode 100644
index 000000000000..4508e9dcbee2
--- /dev/null
+++ b/fs/bcachefs/time_stats.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+static const struct time_unit time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "d", (u64) NSEC_PER_SEC * 3600 * 24},
+ { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+ { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+ { "eon", U64_MAX },
+};
+
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
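quantiles_update() descends an implicit binary search tree stored in eytzinger (breadth-first) order, where the children of node i live at 2i+1 and 2i+2 - which is all eytzinger0_child() computes. A standalone sketch of the traversal (illustrative tree):

    #include <stdio.h>

    static unsigned eytzinger0_child_sketch(unsigned i, unsigned right)
    {
        return 2 * i + 1 + right;
    }

    int main(void)
    {
        const int tree[7] = { 40, 20, 60, 10, 30, 50, 70 }; /* root at 0 */
        int v = 55;
        unsigned i = 0;

        while (i < 7) {
            printf("visit tree[%u] = %d\n", i, tree[i]);
            if (tree[i] == v)
                break;
            i = eytzinger0_child_sketch(i, v > tree[i]);
        }
        return 0;
    }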
+
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+ bool initted = stats->last_event != 0;
+
+ if (time_after64(end, start)) {
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+ duration, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
+
+ if (quantiles)
+ quantiles_update(quantiles, duration);
+ }
+
+ if (stats->last_event && time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+ freq, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ }
+
+ stats->last_event = end;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ for (struct time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ __bch2_time_stats_clear_buffer(stats, b);
+ spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
new file mode 100644
index 000000000000..5df61403744b
--- /dev/null
+++ b/fs/bcachefs/time_stats.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ * everywhere without worrying about overhead
+ *
+ * tracks:
+ * - number of events
+ * - maximum event duration ever seen
+ * - sum of all event durations
+ * - average event duration, standard and weighted
+ * - standard deviation of event durations, standard and weighted
+ * and analogous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
+
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+#include "mean_and_variance.h"
+
+struct time_unit {
+ const char *name;
+ u64 nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if bch2_time_stats->have_quantiles has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[31];
+};
+
+struct bch2_time_stats {
+ spinlock_t lock;
+ bool have_quantiles;
+ /* all fields are in nanoseconds */
+ u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ u64 last_event_start;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT 8
+
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ struct time_stat_buffer __percpu *buffer;
+};
+
+struct bch2_time_stats_quantiles {
+ struct bch2_time_stats stats;
+ struct quantiles quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
+{
+ return stats->have_quantiles
+ ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
+ : NULL;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+
+/**
+ * bch2_time_stats_update - collect a new event being tracked
+ *
+ * @stats: bch2_time_stats to update
+ * @start: start time of event, recorded with local_clock()
+ *
+ * The end time of the event is the current time
+ */
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats: bch2_time_stats to update
+ * @v: new state, true or false
+ *
+ * Use this when tracking time stats for state changes, e.g. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
+{
+ if (v != !!stats->last_event_start) {
+ if (!v) {
+ bch2_time_stats_update(stats, stats->last_event_start);
+ stats->last_event_start = 0;
+ } else {
+ stats->last_event_start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
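
A hedged usage sketch of the edge-triggered helper above (note_blocked() is a hypothetical caller, not part of this patch):

/* Accumulate how long a resource spends blocked: the false->true edge
 * starts timing (and returns true), the true->false edge folds the
 * elapsed time into @stats via bch2_time_stats_update(). */
static void note_blocked(struct bch2_time_stats *stats, bool blocked)
{
	if (track_event_change(stats, blocked))
		pr_debug("resource just became blocked\n");
}
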
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_exit(&statq->stats);
+}
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_init(&statq->stats);
+ statq->stats.have_quantiles = true;
+ memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _BCACHEFS_TIME_STATS_H */
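
Putting the API together, a hypothetical caller might look like the following; my_op_*() and do_the_work() are placeholders, and bch2_time_stats_to_text() is the printer that remains declared in util.h:

static struct bch2_time_stats my_op_times;

void my_op_init(void)
{
	bch2_time_stats_init(&my_op_times);
}

void my_op(void)
{
	u64 start = local_clock();

	do_the_work();				/* hypothetical workload */
	bch2_time_stats_update(&my_op_times, start);
}

void my_op_show(struct printbuf *out)
{
	bch2_time_stats_to_text(out, &my_op_times);
}
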
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 293b90d704fb..6aa81d1e6d36 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+);
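
TRACE_EVENT(error_downcast) generates a trace_error_downcast() call; a plausible call site, at the point where a bcachefs-private error code is narrowed to a standard errno (sketch only, the exact site is in the error-code handling and not shown in this hunk):

	trace_error_downcast(bch_err, std_err, _RET_IP_);
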
+
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 3a32faa86b5c..216fadf16928 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct bch2_quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
-
- if (time_after64(end, start)) {
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
- bch2_quantiles_update(&stats->quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- for (struct bch2_time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- WARN_ONCE(!stats->duration_stats_weighted.weight ||
- !stats->freq_stats_weighted.weight,
- "uninitialized time_stats");
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- bch2_time_stats_update_one(stats, start, end);
-
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct bch2_time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct bch2_time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- bch2_time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
-
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
prt_tab_rjust(out);
@@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
- const struct time_unit *u;
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
- int i;
+ u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
@@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
@@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
printbuf_tabstops_reset(out);
- i = eytzinger0_first(NR_QUANTILES);
- u = pick_time_units(stats->quantiles.entries[i].m);
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
- q = max(stats->quantiles.entries[i].m, last_q);
- prt_printf(out, "%llu ",
- div_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
+ if (quantiles) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ bch2_pick_time_units(quantiles->entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(quantiles->entries[i].m, last_q);
+ prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
}
}
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->duration_stats_weighted.weight = 8;
- stats->freq_stats_weighted.weight = 8;
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
/* ratelimit: */
@@ -1007,28 +850,6 @@ void sort_cmp_size(void *base, size_t num, size_t size,
}
}
-static void mempool_free_vp(void *element, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return size < PAGE_SIZE
- ? mempool_init_kmalloc_pool(pool, min_nr, size)
- : mempool_init(pool, min_nr, mempool_alloc_vp,
- mempool_free_vp, (void *) size);
-}
-
#if 0
void eytzinger1_test(void)
{
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b414736d59a5..175aee3074c7 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -21,6 +21,7 @@
#include "mean_and_variance.h"
#include "darray.h"
+#include "time_stats.h"
struct closure;
@@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-static inline void vpfree(void *p, size_t size)
-{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
- return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
- get_order(size)) ?:
- __vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
- if (size < PAGE_SIZE)
- kfree(p);
- else
- vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
- return size < PAGE_SIZE
- ? kmalloc(size, gfp_mask)
- : vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
#define HEAP(type) \
struct { \
size_t size, used; \
@@ -97,13 +66,13 @@ struct { \
({ \
(heap)->used = 0; \
(heap)->size = (_size); \
- (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
- kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
@@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
- struct bch2_quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
- unsigned nr;
- struct bch2_time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[32];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- struct bch2_quantiles quantiles;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance freq_stats;
- struct mean_and_variance_weighted freq_stats_weighted;
- struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- if (v != !!*start) {
- if (!v) {
- bch2_time_stats_update(stats, *start);
- *start = 0;
- } else {
- *start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- bool ret = v && !*start;
- *start = v;
- return ret;
-}
-#endif
-
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-
#define ewma_add(ewma, val, weight) \
({ \
typeof(ewma) _ewma = (ewma); \
@@ -788,8 +681,15 @@ static inline void __move_gap(void *array, size_t element_size,
}
/* Move the gap in a gap buffer: */
-#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
- __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+#define move_gap(_d, _new_gap) \
+do { \
+ BUG_ON(_new_gap > (_d)->nr); \
+ BUG_ON((_d)->gap > (_d)->nr); \
+ \
+ __move_gap((_d)->data, sizeof((_d)->data[0]), \
+ (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
+ (_d)->gap = _new_gap; \
+} while (0)
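
The reworked move_gap() now takes any darray-style container with data/nr/size members plus a gap index, rather than the fields individually. A hypothetical gap-buffer illustration (int_gap_buf and insert_at() are invented for this sketch):

/* Live entries occupy [0, gap) and [gap + (size - nr), size); the hole
 * in between is the gap. move_gap() shifts the hole so insertions at
 * the new position are O(1). */
struct int_gap_buf {
	int	*data;
	size_t	nr;	/* number of live entries */
	size_t	size;	/* allocated entries, >= nr */
	size_t	gap;	/* index where the hole currently starts */
};

static void insert_at(struct int_gap_buf *b, size_t pos, int v)
{
	BUG_ON(b->nr == b->size);	/* caller must have grown the array */
	move_gap(b, pos);		/* hole now starts at pos */
	b->data[b->gap++] = v;		/* consume one hole slot */
	b->nr++;
}
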
#define bubble_sort(_base, _nr, _cmp) \
do { \
@@ -876,4 +776,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
void bch2_darray_str_exit(darray_str *);
int bch2_split_devs(const char *, darray_str *);
+#ifdef __KERNEL__
+
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+ return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+ return copy_from_user(to, from, n) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
+}
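
A hypothetical use of the two helpers above in an ioctl path (the struct layout and function name are invented for illustration): the _errcode variants collapse copy_{to,from}_user()'s "bytes not copied" return into 0 / -EFAULT, and __set_bit_le64() sets a bit in a little-endian on-disk bitmap regardless of host endianness.

static long my_ioctl_get_flags(void __user *uarg)
{
	struct { __le64 bitmap[2]; } arg = {};	/* hypothetical layout */

	__set_bit_le64(0, arg.bitmap);		/* flag 0, first word */
	__set_bit_le64(65, arg.bitmap);		/* flag 65, second word */

	return copy_to_user_errcode(uarg, &arg, sizeof(arg));
}
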
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 9c0d2316031b..754f17bba68e 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
kfree(buf);
if (ret < 0)
- return ret;
+ goto err_class_exit;
ret = bch2_opt_check_may_set(c, opt_id, v);
if (ret < 0)
- return ret;
+ goto err_class_exit;
s.v = v + 1;
s.defined = true;
@@ -595,6 +595,7 @@ err:
(opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
+err_class_exit:
return bch2_err_class(ret);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a2d07fa3cfdf..1dc1f1946ae0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1303,6 +1303,47 @@ int btrfs_forget_devices(dev_t devt)
return ret;
}
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+ const char *path, dev_t devt,
+ bool mount_arg_dev)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Do not skip device registration for mounted devices with matching
+ * maj:min but different paths. Booting without initrd relies on
+ * /dev/root initially, later replaced with the actual root device.
+ * A successful scan ensures grub2-probe selects the correct device.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ if (!fs_devices->opened) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ continue;
+ }
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->bdev && (device->bdev->bd_dev == devt) &&
+ strcmp(device->name->str, path) != 0) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* Do not skip registration. */
+ return false;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+ }
+
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
+ return true;
+
+ return false;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -1320,6 +1361,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
struct btrfs_device *device = NULL;
struct file *bdev_file;
u64 bytenr, bytenr_orig;
+ dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1359,19 +1401,13 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto error_bdev_put;
}
- if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
- !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
- dev_t devt;
+ devt = file_bdev(bdev_file)->bd_dev;
+ if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
+ pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ path, MAJOR(devt), MINOR(devt));
- ret = lookup_bdev(path, &devt);
- if (ret)
- btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
- path, ret);
- else
- btrfs_free_stale_devices(devt, NULL);
+ btrfs_free_stale_devices(devt, NULL);
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
- path, MAJOR(devt), MINOR(devt));
device = NULL;
goto free_disk_super;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 034a617cb1a5..a40da0065433 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -751,13 +751,28 @@ static void __debugfs_file_removed(struct dentry *dentry)
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
- /* if we hit zero, just wait for all to finish */
- if (!refcount_dec_and_test(&fsd->active_users)) {
- wait_for_completion(&fsd->active_users_drained);
+ /* if this was the last reference, we're done */
+ if (refcount_dec_and_test(&fsd->active_users))
return;
- }
- /* if we didn't hit zero, try to cancel any we can */
+ /*
+ * If there's still a reference, the code that obtained it can
+ * be in different states:
+ * - The common case of not using cancellations, or already
+ * after debugfs_leave_cancellation(), where we just need
+ * to wait for debugfs_file_put() which signals the completion;
+ * - inside a cancellation section, i.e. between
+ * debugfs_enter_cancellation() and debugfs_leave_cancellation(),
+ * in which case we need to trigger the ->cancel() function,
+ * and then wait for debugfs_file_put() just like in the
+ * previous case;
+ * - before debugfs_enter_cancellation() (but obviously after
+ * debugfs_file_get()), in which case we may not see the
+ * cancellation in the list on the first round of the loop,
+ * but debugfs_enter_cancellation() signals the completion
+ * after adding it, so this code gets woken up to call the
+ * ->cancel() function.
+ */
while (refcount_read(&fsd->active_users)) {
struct debugfs_cancellation *c;
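
For context, a hedged sketch of the cancellation section the comment above describes, from the file-handler side; the handler and callback names are hypothetical, while debugfs_{enter,leave}_cancellation() is the API this removal path synchronizes against:

/* A read handler that may sleep for a long time. Entering a
 * cancellation section lets debugfs file removal invoke ->cancel()
 * instead of blocking until the handler returns. */
static void my_cancel(struct dentry *dentry, void *data)
{
	complete(data);		/* abort the wait below */
}

static ssize_t my_read(struct file *file, char __user *buf,
		       size_t count, loff_t *ppos)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct debugfs_cancellation c = {
		.cancel		= my_cancel,
		.cancel_data	= &done,
	};

	my_device_arm_event(&done);	/* hypothetical: completes 'done' */
	debugfs_enter_cancellation(file, &c);
	wait_for_completion(&done);
	debugfs_leave_cancellation(file, &c);

	return 0;			/* data copy omitted in this sketch */
}
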
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index dfc444dad329..3b4dbce849f0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -246,7 +246,7 @@ struct dlm_lkb {
int8_t lkb_highbast; /* highest mode bast sent for */
int8_t lkb_wait_type; /* type of reply waiting for */
- atomic_t lkb_wait_count;
+ int8_t lkb_wait_count;
int lkb_wait_nodeid; /* for debugging */
struct list_head lkb_statequeue; /* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 652c51fbbf76..fd752dd03896 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1407,7 +1407,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error = 0;
- int wc;
mutex_lock(&ls->ls_waiters_mutex);
@@ -1429,17 +1428,20 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
error = -EBUSY;
goto out;
}
- wc = atomic_inc_return(&lkb->lkb_wait_count);
+ lkb->lkb_wait_count++;
hold_lkb(lkb);
log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
- lkb->lkb_id, lkb->lkb_wait_type, mstype, wc,
- dlm_iflags_val(lkb));
+ lkb->lkb_id, lkb->lkb_wait_type, mstype,
+ lkb->lkb_wait_count, dlm_iflags_val(lkb));
goto out;
}
- wc = atomic_fetch_inc(&lkb->lkb_wait_count);
- DLM_ASSERT(!wc, dlm_print_lkb(lkb); printk("wait_count %d\n", wc););
+ DLM_ASSERT(!lkb->lkb_wait_count,
+ dlm_print_lkb(lkb);
+ printk("wait_count %d\n", lkb->lkb_wait_count););
+
+ lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
@@ -1502,7 +1504,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
lkb->lkb_id);
lkb->lkb_wait_type = 0;
- atomic_dec(&lkb->lkb_wait_count);
+ lkb->lkb_wait_count--;
unhold_lkb(lkb);
goto out_del;
}
@@ -1529,15 +1531,16 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
if (overlap_done && lkb->lkb_wait_type) {
log_error(ls, "remwait error %x reply %d wait_type %d overlap",
lkb->lkb_id, mstype, lkb->lkb_wait_type);
- atomic_dec(&lkb->lkb_wait_count);
+ lkb->lkb_wait_count--;
unhold_lkb(lkb);
lkb->lkb_wait_type = 0;
}
- DLM_ASSERT(atomic_read(&lkb->lkb_wait_count), dlm_print_lkb(lkb););
+ DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
- if (atomic_dec_and_test(&lkb->lkb_wait_count))
+ lkb->lkb_wait_count--;
+ if (!lkb->lkb_wait_count)
list_del_init(&lkb->lkb_wait_reply);
unhold_lkb(lkb);
return 0;
@@ -2666,7 +2669,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
goto out;
/* lock not allowed if there's any op in progress */
- if (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count))
+ if (lkb->lkb_wait_type || lkb->lkb_wait_count)
goto out;
if (is_overlap(lkb))
@@ -2728,7 +2731,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
/* normal unlock not allowed if there's any op in progress */
if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
- (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count)))
+ (lkb->lkb_wait_type || lkb->lkb_wait_count))
goto out;
/* an lkb may be waiting for an rsb lookup to complete where the
@@ -5011,21 +5014,32 @@ static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
return lkb;
}
-/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
- master or dir-node for r. Processing the lkb may result in it being placed
- back on waiters. */
-
-/* We do this after normal locking has been enabled and any saved messages
- (in requestqueue) have been processed. We should be confident that at
- this point we won't get or process a reply to any of these waiting
- operations. But, new ops may be coming in on the rsbs/locks here from
- userspace or remotely. */
-
-/* there may have been an overlap unlock/cancel prior to recovery or after
- recovery. if before, the lkb may still have a pos wait_count; if after, the
- overlap flag would just have been set and nothing new sent. we can be
- confident here than any replies to either the initial op or overlap ops
- prior to recovery have been received. */
+/*
+ * Forced state reset for locks that were in the middle of remote operations
+ * when recovery happened (i.e. lkbs that were on the waiters list, waiting
+ * for a reply from a remote operation.) The lkbs remaining on the waiters
+ * list need to be reevaluated; some may need resending to a different node
+ * than previously, and some may now need local handling rather than remote.
+ *
+ * First, the lkb state for the voided remote operation is forcibly reset,
+ * equivalent to what remove_from_waiters() would normally do:
+ * . lkb removed from ls_waiters list
+ * . lkb wait_type cleared
+ * . lkb waiters_count cleared
+ * . lkb ref count decremented for each waiters_count (almost always 1,
+ * but possibly 2 in case of cancel/unlock overlapping, which means
+ * two remote replies were being expected for the lkb.)
+ *
+ * Second, the lkb is reprocessed like an original operation would be,
+ * by passing it to _request_lock or _convert_lock, which will either
+ * process the lkb operation locally, or send it to a remote node again
+ * and put the lkb back onto the waiters list.
+ *
+ * When reprocessing the lkb, we may find that it's flagged for an overlapping
+ * force-unlock or cancel, either from before recovery began, or after recovery
+ * finished. If this is the case, the unlock/cancel is done directly, and the
+ * original operation is not initiated again (no _request_lock/_convert_lock.)
+ */
int dlm_recover_waiters_post(struct dlm_ls *ls)
{
@@ -5040,6 +5054,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
break;
}
+ /*
+ * Find an lkb from the waiters list that's been affected by
+ * recovery node changes, and needs to be reprocessed. Does
+ * hold_lkb(), adding a refcount.
+ */
lkb = find_resend_waiter(ls);
if (!lkb)
break;
@@ -5048,6 +5067,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
hold_rsb(r);
lock_rsb(r);
+ /*
+ * If the lkb has been flagged for a force unlock or cancel,
+ * then the reprocessing below will be replaced by just doing
+ * the unlock/cancel directly.
+ */
mstype = lkb->lkb_wait_type;
oc = test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT,
&lkb->lkb_iflags);
@@ -5061,22 +5085,40 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
dlm_dir_nodeid(r), oc, ou);
- /* At this point we assume that we won't get a reply to any
- previous op or overlap op on this lock. First, do a big
- remove_from_waiters() for all previous ops. */
+ /*
+ * No reply to the pre-recovery operation will now be received,
+ * so a forced equivalent of remove_from_waiters() is needed to
+ * reset the waiters state that was in place before recovery.
+ */
clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+
+ /* Forcibly clear wait_type */
lkb->lkb_wait_type = 0;
- /* drop all wait_count references we still
- * hold a reference for this iteration.
+
+ /*
+ * Forcibly reset wait_count and associated refcount. The
+ * wait_count will almost always be 1, but in case of an
+ * overlapping unlock/cancel it could be 2: see where
+ * add_to_waiters() finds the lkb is already on the waiters
+ * list and does lkb_wait_count++; hold_lkb().
*/
- while (!atomic_dec_and_test(&lkb->lkb_wait_count))
+ while (lkb->lkb_wait_count) {
+ lkb->lkb_wait_count--;
unhold_lkb(lkb);
+ }
+ /* Forcibly remove from waiters list */
mutex_lock(&ls->ls_waiters_mutex);
list_del_init(&lkb->lkb_wait_reply);
mutex_unlock(&ls->ls_waiters_mutex);
+ /*
+ * The lkb is now clear of all prior waiters state and can be
+ * processed locally, or sent to remote node again, or directly
+ * cancelled/unlocked.
+ */
+
if (oc || ou) {
/* do an unlock or cancel instead of resending */
switch (mstype) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 695e691b38b3..9f9b68448830 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -806,7 +806,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
struct dlm_lkb *lkb;
DECLARE_WAITQUEUE(wait, current);
struct dlm_callback *cb;
- int rv, copy_lvb = 0;
+ int rv, ret, copy_lvb = 0;
int old_mode, new_mode;
if (count == sizeof(struct dlm_device_version)) {
@@ -906,9 +906,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
}
- rv = copy_result_to_user(lkb->lkb_ua,
- test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
- cb->flags, cb->mode, copy_lvb, buf, count);
+ ret = copy_result_to_user(lkb->lkb_ua,
+ test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+ cb->flags, cb->mode, copy_lvb, buf, count);
kref_put(&cb->ref, dlm_release_callback);
@@ -916,7 +916,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
if (rv == DLM_DEQUEUE_CALLBACK_LAST)
dlm_put_lkb(lkb);
- return rv;
+ return ret;
}
static __poll_t device_poll(struct file *file, poll_table *wait)
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 9f9295847a4e..077944d3c2c0 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -448,88 +448,34 @@ static void exfat_init_name_entry(struct exfat_dentry *ep,
}
}
-int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, unsigned int type, unsigned int start_clu,
- unsigned long long size)
+void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
+ unsigned int type, unsigned int start_clu,
+ unsigned long long size, struct timespec64 *ts)
{
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb = es->sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct timespec64 ts = current_time(inode);
struct exfat_dentry *ep;
- struct buffer_head *bh;
-
- /*
- * We cannot use exfat_get_dentry_set here because file ep is not
- * initialized yet.
- */
- ep = exfat_get_dentry(sb, p_dir, entry, &bh);
- if (!ep)
- return -EIO;
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
exfat_set_entry_type(ep, type);
- exfat_set_entry_time(sbi, &ts,
+ exfat_set_entry_time(sbi, ts,
&ep->dentry.file.create_tz,
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
&ep->dentry.file.create_time_cs);
- exfat_set_entry_time(sbi, &ts,
+ exfat_set_entry_time(sbi, ts,
&ep->dentry.file.modify_tz,
&ep->dentry.file.modify_time,
&ep->dentry.file.modify_date,
&ep->dentry.file.modify_time_cs);
- exfat_set_entry_time(sbi, &ts,
+ exfat_set_entry_time(sbi, ts,
&ep->dentry.file.access_tz,
&ep->dentry.file.access_time,
&ep->dentry.file.access_date,
NULL);
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
-
- ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
- if (!ep)
- return -EIO;
-
+ ep = exfat_get_dentry_cached(es, ES_IDX_STREAM);
exfat_init_stream_entry(ep, start_clu, size);
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
-
- return 0;
-}
-
-int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
- int entry)
-{
- struct super_block *sb = inode->i_sb;
- int ret = 0;
- int i, num_entries;
- u16 chksum;
- struct exfat_dentry *ep, *fep;
- struct buffer_head *fbh, *bh;
-
- fep = exfat_get_dentry(sb, p_dir, entry, &fbh);
- if (!fep)
- return -EIO;
-
- num_entries = fep->dentry.file.num_ext + 1;
- chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
-
- for (i = 1; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
- if (!ep) {
- ret = -EIO;
- goto release_fbh;
- }
- chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
- CS_DEFAULT);
- brelse(bh);
- }
-
- fep->dentry.file.checksum = cpu_to_le16(chksum);
- exfat_update_bh(fbh, IS_DIRSYNC(inode));
-release_fbh:
- brelse(fbh);
- return ret;
}
static void exfat_free_benign_secondary_clusters(struct inode *inode,
@@ -551,76 +497,49 @@ static void exfat_free_benign_secondary_clusters(struct inode *inode,
exfat_free_cluster(inode, &dir);
}
-int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int num_entries, struct exfat_uni_name *p_uniname)
+void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries,
+ struct exfat_uni_name *p_uniname)
{
- struct super_block *sb = inode->i_sb;
int i;
unsigned short *uniname = p_uniname->name;
struct exfat_dentry *ep;
- struct buffer_head *bh;
- int sync = IS_DIRSYNC(inode);
-
- ep = exfat_get_dentry(sb, p_dir, entry, &bh);
- if (!ep)
- return -EIO;
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
ep->dentry.file.num_ext = (unsigned char)(num_entries - 1);
- exfat_update_bh(bh, sync);
- brelse(bh);
-
- ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
- if (!ep)
- return -EIO;
+ ep = exfat_get_dentry_cached(es, ES_IDX_STREAM);
ep->dentry.stream.name_len = p_uniname->name_len;
ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash);
- exfat_update_bh(bh, sync);
- brelse(bh);
-
- for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
- if (!ep)
- return -EIO;
-
- if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
- exfat_free_benign_secondary_clusters(inode, ep);
+ for (i = ES_IDX_FIRST_FILENAME; i < num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
exfat_init_name_entry(ep, uniname);
- exfat_update_bh(bh, sync);
- brelse(bh);
uniname += EXFAT_FILE_NAME_LEN;
}
- exfat_update_dir_chksum(inode, p_dir, entry);
- return 0;
+ exfat_update_dir_chksum(es);
}
-int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int order, int num_entries)
+void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es,
+ int order)
{
- struct super_block *sb = inode->i_sb;
int i;
struct exfat_dentry *ep;
- struct buffer_head *bh;
- for (i = order; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
- if (!ep)
- return -EIO;
+ for (i = order; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
exfat_free_benign_secondary_clusters(inode, ep);
exfat_set_entry_type(ep, TYPE_DELETED);
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
}
- return 0;
+ if (order < es->num_entries)
+ es->modified = true;
}
-void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
+void exfat_update_dir_chksum(struct exfat_entry_set_cache *es)
{
int chksum_type = CS_DIR_ENTRY, i;
unsigned short chksum = 0;
@@ -775,7 +694,6 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
}
enum exfat_validate_dentry_mode {
- ES_MODE_STARTED,
ES_MODE_GET_FILE_ENTRY,
ES_MODE_GET_STRM_ENTRY,
ES_MODE_GET_NAME_ENTRY,
@@ -790,11 +708,6 @@ static bool exfat_validate_entry(unsigned int type,
return false;
switch (*mode) {
- case ES_MODE_STARTED:
- if (type != TYPE_FILE && type != TYPE_DIR)
- return false;
- *mode = ES_MODE_GET_FILE_ENTRY;
- break;
case ES_MODE_GET_FILE_ENTRY:
if (type != TYPE_STREAM)
return false;
@@ -834,7 +747,7 @@ struct exfat_dentry *exfat_get_dentry_cached(
}
/*
- * Returns a set of dentries for a file or dir.
+ * Returns a set of dentries.
*
* Note It provides a direct pointer to bh->data via exfat_get_dentry_cached().
* User should call exfat_get_dentry_set() after setting 'modified' to apply
@@ -842,22 +755,24 @@ struct exfat_dentry *exfat_get_dentry_cached(
*
* in:
* sb+p_dir+entry: indicates a file/dir
- * type: specifies how many dentries should be included.
+ * num_entries: specifies how many dentries should be included.
+ * If it is not 0, es->num_entries will be set to num_entries.
+ * If num_entries is 0, es->num_entries will be obtained
+ * from the first dentry.
+ * out:
+ * es: pointer of entry set on success.
* return:
- * pointer of entry set on success,
- * NULL on failure.
+ * 0 on success
+ * -error code on failure
*/
-int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+static int __exfat_get_dentry_set(struct exfat_entry_set_cache *es,
struct super_block *sb, struct exfat_chain *p_dir, int entry,
- unsigned int type)
+ unsigned int num_entries)
{
int ret, i, num_bh;
unsigned int off;
sector_t sec;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_dentry *ep;
- int num_entries;
- enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
struct buffer_head *bh;
if (p_dir->dir == DIR_DELETED) {
@@ -880,12 +795,18 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
return -EIO;
es->bh[es->num_bh++] = bh;
- ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
- if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto put_es;
+ if (num_entries == ES_ALL_ENTRIES) {
+ struct exfat_dentry *ep;
+
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
+ if (ep->type != EXFAT_FILE) {
+ brelse(bh);
+ return -EIO;
+ }
+
+ num_entries = ep->dentry.file.num_ext + 1;
+ }
- num_entries = type == ES_ALL_ENTRIES ?
- ep->dentry.file.num_ext + 1 : type;
es->num_entries = num_entries;
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
@@ -918,8 +839,27 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
es->bh[es->num_bh++] = bh;
}
+ return 0;
+
+put_es:
+ exfat_put_dentry_set(es, false);
+ return -EIO;
+}
+
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, unsigned int num_entries)
+{
+ int ret, i;
+ struct exfat_dentry *ep;
+ enum exfat_validate_dentry_mode mode = ES_MODE_GET_FILE_ENTRY;
+
+ ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries);
+ if (ret < 0)
+ return ret;
+
/* validate cached dentries */
- for (i = ES_IDX_STREAM; i < num_entries; i++) {
+ for (i = ES_IDX_STREAM; i < es->num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
goto put_es;
@@ -931,6 +871,85 @@ put_es:
return -EIO;
}
+static int exfat_validate_empty_dentry_set(struct exfat_entry_set_cache *es)
+{
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+ int i, off;
+ bool unused_hit = false;
+
+ /*
+ * ONLY UNUSED OR DELETED DENTRIES ARE ALLOWED:
+ * Although it violates the specification for a deleted entry to
+ * follow an unused entry, some exFAT implementations could work
+ * like this. Therefore, to improve compatibility, let's allow it.
+ */
+ for (i = 0; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
+ if (ep->type == EXFAT_UNUSED) {
+ unused_hit = true;
+ } else if (!IS_EXFAT_DELETED(ep->type)) {
+ if (unused_hit)
+ goto err_used_follow_unused;
+ i++;
+ goto count_skip_entries;
+ }
+ }
+
+ return 0;
+
+err_used_follow_unused:
+ off = es->start_off + (i << DENTRY_SIZE_BITS);
+ bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)];
+
+ exfat_fs_error(es->sb,
+ "in sector %lld, dentry %d should be unused, but 0x%x",
+ bh->b_blocknr, off >> DENTRY_SIZE_BITS, ep->type);
+
+ return -EIO;
+
+count_skip_entries:
+ es->num_entries = EXFAT_B_TO_DEN(EXFAT_BLK_TO_B(es->num_bh, es->sb) - es->start_off);
+ for (; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
+ if (IS_EXFAT_DELETED(ep->type))
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Get an empty dentry set.
+ *
+ * in:
+ * sb+p_dir+entry: indicates the empty dentry location
+ * num_entries: specifies how many empty dentries should be included.
+ * out:
+ * es: pointer of empty dentry set on success.
+ * return:
+ * 0 : on success
+ * >0 : the dentries are not empty, the return value is the number of
+ * dentries to be skipped for the next lookup.
+ * <0 : on failure
+ */
+int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, unsigned int num_entries)
+{
+ int ret;
+
+ ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries);
+ if (ret < 0)
+ return ret;
+
+ ret = exfat_validate_empty_dentry_set(es);
+ if (ret)
+ exfat_put_dentry_set(es, false);
+
+ return ret;
+}
+
static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp)
{
hint_femp->eidx = EXFAT_HINT_NONE;
@@ -1187,27 +1206,6 @@ found:
return dentry - num_ext;
}
-int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
- int entry, struct exfat_dentry *ep)
-{
- int i, count = 0;
- unsigned int type;
- struct exfat_dentry *ext_ep;
- struct buffer_head *bh;
-
- for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) {
- ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh);
- if (!ext_ep)
- return -EIO;
-
- type = exfat_get_entry_type(ext_ep);
- brelse(bh);
- if (type & TYPE_CRITICAL_SEC || type & TYPE_BENIGN_SEC)
- count++;
- }
- return count;
-}
-
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
{
int i, count = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 361595433480..ecc5db952deb 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -431,8 +431,6 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc,
unsigned int *content);
int exfat_ent_set(struct super_block *sb, unsigned int loc,
unsigned int content);
-int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
- int entry, struct exfat_dentry *p_entry);
int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
unsigned int len);
int exfat_zeroed_cluster(struct inode *dir, unsigned int clu);
@@ -480,16 +478,14 @@ int exfat_get_cluster(struct inode *inode, unsigned int cluster,
extern const struct inode_operations exfat_dir_inode_operations;
extern const struct file_operations exfat_dir_operations;
unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry);
-int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, unsigned int type, unsigned int start_clu,
- unsigned long long size);
-int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int num_entries, struct exfat_uni_name *p_uniname);
-int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int order, int num_entries);
-int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
- int entry);
-void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
+void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
+ unsigned int type, unsigned int start_clu,
+ unsigned long long size, struct timespec64 *ts);
+void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries,
+ struct exfat_uni_name *p_uniname);
+void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es,
+ int order);
+void exfat_update_dir_chksum(struct exfat_entry_set_cache *es);
int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
@@ -501,7 +497,10 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
struct super_block *sb, struct exfat_chain *p_dir, int entry,
- unsigned int type);
+ unsigned int num_entries);
+int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int num_entries);
int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 0687f952956c..dd894e558c91 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -94,7 +94,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
}
- exfat_update_dir_chksum_with_entry_set(&es);
+ exfat_update_dir_chksum(&es);
return exfat_put_dentry_set(&es, sync);
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 9c549fd11fc8..631ad9e8e32a 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -204,21 +204,16 @@ const struct dentry_operations exfat_utf8_dentry_ops = {
.d_compare = exfat_utf8_d_cmp,
};
-/* used only in search empty_slot() */
-#define CNT_UNUSED_NOHIT (-1)
-#define CNT_UNUSED_HIT (-2)
/* search EMPTY CONTINUOUS "num_entries" entries */
static int exfat_search_empty_slot(struct super_block *sb,
struct exfat_hint_femp *hint_femp, struct exfat_chain *p_dir,
- int num_entries)
+ int num_entries, struct exfat_entry_set_cache *es)
{
- int i, dentry, num_empty = 0;
+ int i, dentry, ret;
int dentries_per_clu;
- unsigned int type;
struct exfat_chain clu;
- struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct buffer_head *bh;
+ int total_entries = EXFAT_CLU_TO_DEN(p_dir->size, sbi);
dentries_per_clu = sbi->dentries_per_clu;
@@ -231,7 +226,7 @@ static int exfat_search_empty_slot(struct super_block *sb,
* Otherwise, and if "dentry + hint_famp->count" is also equal
* to "p_dir->size * dentries_per_clu", it means ENOSPC.
*/
- if (dentry + hint_femp->count == p_dir->size * dentries_per_clu &&
+ if (dentry + hint_femp->count == total_entries &&
num_entries > hint_femp->count)
return -ENOSPC;
@@ -242,69 +237,41 @@ static int exfat_search_empty_slot(struct super_block *sb,
dentry = 0;
}
- while (clu.dir != EXFAT_EOF_CLUSTER) {
+ while (dentry + num_entries < total_entries &&
+ clu.dir != EXFAT_EOF_CLUSTER) {
i = dentry & (dentries_per_clu - 1);
- for (; i < dentries_per_clu; i++, dentry++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh);
- if (!ep)
- return -EIO;
- type = exfat_get_entry_type(ep);
- brelse(bh);
-
- if (type == TYPE_UNUSED || type == TYPE_DELETED) {
- num_empty++;
- if (hint_femp->eidx == EXFAT_HINT_NONE) {
- hint_femp->eidx = dentry;
- hint_femp->count = CNT_UNUSED_NOHIT;
- exfat_chain_set(&hint_femp->cur,
- clu.dir, clu.size, clu.flags);
- }
-
- if (type == TYPE_UNUSED &&
- hint_femp->count != CNT_UNUSED_HIT)
- hint_femp->count = CNT_UNUSED_HIT;
+ ret = exfat_get_empty_dentry_set(es, sb, &clu, i, num_entries);
+ if (ret < 0)
+ return ret;
+ else if (ret == 0)
+ return dentry;
+
+ dentry += ret;
+ i += ret;
+
+ while (i >= dentries_per_clu) {
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
} else {
- if (hint_femp->eidx != EXFAT_HINT_NONE &&
- hint_femp->count == CNT_UNUSED_HIT) {
- /* unused empty group means
- * an empty group which includes
- * unused dentry
- */
- exfat_fs_error(sb,
- "found bogus dentry(%d) beyond unused empty group(%d) (start_clu : %u, cur_clu : %u)",
- dentry, hint_femp->eidx,
- p_dir->dir, clu.dir);
+ if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
- }
-
- num_empty = 0;
- hint_femp->eidx = EXFAT_HINT_NONE;
}
- if (num_empty >= num_entries) {
- /* found and invalidate hint_femp */
- hint_femp->eidx = EXFAT_HINT_NONE;
- return (dentry - (num_entries - 1));
- }
- }
-
- if (clu.flags == ALLOC_NO_FAT_CHAIN) {
- if (--clu.size > 0)
- clu.dir++;
- else
- clu.dir = EXFAT_EOF_CLUSTER;
- } else {
- if (exfat_get_next_cluster(sb, &clu.dir))
- return -EIO;
+ i -= dentries_per_clu;
}
}
- hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty;
- hint_femp->count = num_empty;
- if (num_empty == 0)
+ hint_femp->eidx = dentry;
+ hint_femp->count = 0;
+ if (dentry == total_entries || clu.dir == EXFAT_EOF_CLUSTER)
exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0,
clu.flags);
+ else
+ hint_femp->cur = clu;
return -ENOSPC;
}
@@ -325,7 +292,8 @@ static int exfat_check_max_dentries(struct inode *inode)
* if there isn't any empty slot, expand cluster chain.
*/
static int exfat_find_empty_entry(struct inode *inode,
- struct exfat_chain *p_dir, int num_entries)
+ struct exfat_chain *p_dir, int num_entries,
+ struct exfat_entry_set_cache *es)
{
int dentry;
unsigned int ret, last_clu;
@@ -344,7 +312,7 @@ static int exfat_find_empty_entry(struct inode *inode,
}
while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir,
- num_entries)) < 0) {
+ num_entries, es)) < 0) {
if (dentry == -EIO)
break;
@@ -499,6 +467,8 @@ static int exfat_add_entry(struct inode *inode, const char *path,
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_uni_name uniname;
struct exfat_chain clu;
+ struct timespec64 ts = current_time(inode);
+ struct exfat_entry_set_cache es;
int clu_size = 0;
unsigned int start_clu = EXFAT_FREE_CLUSTER;
@@ -513,7 +483,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
}
/* exfat_find_empty_entry must be called before alloc_cluster() */
- dentry = exfat_find_empty_entry(inode, p_dir, num_entries);
+ dentry = exfat_find_empty_entry(inode, p_dir, num_entries, &es);
if (dentry < 0) {
ret = dentry; /* -EIO or -ENOSPC */
goto out;
@@ -521,8 +491,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
if (type == TYPE_DIR && !sbi->options.zero_size_dir) {
ret = exfat_alloc_new_dir(inode, &clu);
- if (ret)
+ if (ret) {
+ exfat_put_dentry_set(&es, false);
goto out;
+ }
start_clu = clu.dir;
clu_size = sbi->cluster_size;
}
@@ -531,12 +503,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
/* fill the dos name directory entry information of the created file.
* the first cluster is not determined yet. (0)
*/
- ret = exfat_init_dir_entry(inode, p_dir, dentry, type,
- start_clu, clu_size);
- if (ret)
- goto out;
+ exfat_init_dir_entry(&es, type, start_clu, clu_size, &ts);
+ exfat_init_ext_entry(&es, num_entries, &uniname);
- ret = exfat_init_ext_entry(inode, p_dir, dentry, num_entries, &uniname);
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(inode));
if (ret)
goto out;
@@ -577,6 +547,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
struct exfat_dir_entry info;
loff_t i_pos;
int err;
+ loff_t size = i_size_read(dir);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -587,7 +558,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
inode_inc_iversion(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- if (IS_DIRSYNC(dir))
+ if (IS_DIRSYNC(dir) && size != i_size_read(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
@@ -795,12 +766,11 @@ unlock:
static int exfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct exfat_chain cdir;
- struct exfat_dentry *ep;
struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct buffer_head *bh;
- int num_entries, entry, err = 0;
+ struct exfat_entry_set_cache es;
+ int entry, err = 0;
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_chain_dup(&cdir, &ei->dir);
@@ -811,26 +781,20 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- ep = exfat_get_dentry(sb, &cdir, entry, &bh);
- if (!ep) {
- err = -EIO;
- goto unlock;
- }
- num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
- if (num_entries < 0) {
+ err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES);
+ if (err) {
err = -EIO;
- brelse(bh);
goto unlock;
}
- num_entries++;
- brelse(bh);
exfat_set_volume_dirty(sb);
+
/* update the directory entry */
- if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
- err = -EIO;
+ exfat_remove_entries(inode, &es, ES_IDX_FILE);
+
+ err = exfat_put_dentry_set(&es, IS_DIRSYNC(inode));
+ if (err)
goto unlock;
- }
/* This doesn't modify ei */
ei->dir.dir = DIR_DELETED;
@@ -838,10 +802,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
inode_inc_iversion(dir);
simple_inode_init_ts(dir);
exfat_truncate_inode_atime(dir);
- if (IS_DIRSYNC(dir))
- exfat_sync_inode(dir);
- else
- mark_inode_dirty(dir);
+ mark_inode_dirty(dir);
clear_nlink(inode);
simple_inode_init_ts(inode);
@@ -862,6 +823,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct exfat_chain cdir;
loff_t i_pos;
int err;
+ loff_t size = i_size_read(dir);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -872,7 +834,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode_inc_iversion(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- if (IS_DIRSYNC(dir))
+ if (IS_DIRSYNC(dir) && size != i_size_read(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
@@ -946,13 +908,12 @@ static int exfat_check_dir_empty(struct super_block *sb,
static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- struct exfat_dentry *ep;
struct exfat_chain cdir, clu_to_free;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct buffer_head *bh;
- int num_entries, entry, err;
+ struct exfat_entry_set_cache es;
+ int entry, err;
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
@@ -976,27 +937,20 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- ep = exfat_get_dentry(sb, &cdir, entry, &bh);
- if (!ep) {
- err = -EIO;
- goto unlock;
- }
-
- num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
- if (num_entries < 0) {
+ err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES);
+ if (err) {
err = -EIO;
- brelse(bh);
goto unlock;
}
- num_entries++;
- brelse(bh);
exfat_set_volume_dirty(sb);
- err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
- if (err) {
- exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
+
+ exfat_remove_entries(inode, &es, ES_IDX_FILE);
+
+ err = exfat_put_dentry_set(&es, IS_DIRSYNC(dir));
+ if (err)
goto unlock;
- }
+
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
@@ -1022,67 +976,52 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
int oldentry, struct exfat_uni_name *p_uniname,
struct exfat_inode_info *ei)
{
- int ret, num_old_entries, num_new_entries;
+ int ret, num_new_entries;
struct exfat_dentry *epold, *epnew;
struct super_block *sb = inode->i_sb;
- struct buffer_head *new_bh, *old_bh;
+ struct exfat_entry_set_cache old_es, new_es;
int sync = IS_DIRSYNC(inode);
- epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh);
- if (!epold)
- return -EIO;
-
- num_old_entries = exfat_count_ext_entries(sb, p_dir, oldentry, epold);
- if (num_old_entries < 0)
- return -EIO;
- num_old_entries++;
-
num_new_entries = exfat_calc_num_entries(p_uniname);
if (num_new_entries < 0)
return num_new_entries;
- if (num_old_entries < num_new_entries) {
- int newentry;
+ ret = exfat_get_dentry_set(&old_es, sb, p_dir, oldentry, ES_ALL_ENTRIES);
+ if (ret)
+ return -EIO;
- newentry =
- exfat_find_empty_entry(inode, p_dir, num_new_entries);
- if (newentry < 0)
- return newentry; /* -EIO or -ENOSPC */
+ epold = exfat_get_dentry_cached(&old_es, ES_IDX_FILE);
- epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh);
- if (!epnew)
- return -EIO;
+ if (old_es.num_entries < num_new_entries) {
+ int newentry;
+ newentry = exfat_find_empty_entry(inode, p_dir, num_new_entries,
+ &new_es);
+ if (newentry < 0) {
+ ret = newentry; /* -EIO or -ENOSPC */
+ goto put_old_es;
+ }
+
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_FILE);
*epnew = *epold;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
ei->attr |= EXFAT_ATTR_ARCHIVE;
}
- exfat_update_bh(new_bh, sync);
- brelse(old_bh);
- brelse(new_bh);
-
- epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh);
- if (!epold)
- return -EIO;
- epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh);
- if (!epnew) {
- brelse(old_bh);
- return -EIO;
- }
+ epold = exfat_get_dentry_cached(&old_es, ES_IDX_STREAM);
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_STREAM);
*epnew = *epold;
- exfat_update_bh(new_bh, sync);
- brelse(old_bh);
- brelse(new_bh);
- ret = exfat_init_ext_entry(inode, p_dir, newentry,
- num_new_entries, p_uniname);
+ exfat_init_ext_entry(&new_es, num_new_entries, p_uniname);
+
+ ret = exfat_put_dentry_set(&new_es, sync);
if (ret)
- return ret;
+ goto put_old_es;
- exfat_remove_entries(inode, p_dir, oldentry, 0,
- num_old_entries);
+ exfat_remove_entries(inode, &old_es, ES_IDX_FILE);
ei->dir = *p_dir;
ei->entry = newentry;
} else {
@@ -1090,85 +1029,72 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epold->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
ei->attr |= EXFAT_ATTR_ARCHIVE;
}
- exfat_update_bh(old_bh, sync);
- brelse(old_bh);
- ret = exfat_init_ext_entry(inode, p_dir, oldentry,
- num_new_entries, p_uniname);
- if (ret)
- return ret;
- exfat_remove_entries(inode, p_dir, oldentry, num_new_entries,
- num_old_entries);
+ exfat_remove_entries(inode, &old_es, ES_IDX_FIRST_FILENAME + 1);
+ exfat_init_ext_entry(&old_es, num_new_entries, p_uniname);
}
- return 0;
+ return exfat_put_dentry_set(&old_es, sync);
+
+put_old_es:
+ exfat_put_dentry_set(&old_es, false);
+ return ret;
}
static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
int oldentry, struct exfat_chain *p_newdir,
struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei)
{
- int ret, newentry, num_new_entries, num_old_entries;
+ int ret, newentry, num_new_entries;
struct exfat_dentry *epmov, *epnew;
struct super_block *sb = inode->i_sb;
- struct buffer_head *mov_bh, *new_bh;
-
- epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh);
- if (!epmov)
- return -EIO;
-
- num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
- epmov);
- if (num_old_entries < 0)
- return -EIO;
- num_old_entries++;
+ struct exfat_entry_set_cache mov_es, new_es;
num_new_entries = exfat_calc_num_entries(p_uniname);
if (num_new_entries < 0)
return num_new_entries;
- newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries);
- if (newentry < 0)
- return newentry; /* -EIO or -ENOSPC */
-
- epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh);
- if (!epnew)
+ ret = exfat_get_dentry_set(&mov_es, sb, p_olddir, oldentry,
+ ES_ALL_ENTRIES);
+ if (ret)
return -EIO;
+ newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries,
+ &new_es);
+ if (newentry < 0) {
+ ret = newentry; /* -EIO or -ENOSPC */
+ goto put_mov_es;
+ }
+
+ epmov = exfat_get_dentry_cached(&mov_es, ES_IDX_FILE);
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_FILE);
*epnew = *epmov;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
ei->attr |= EXFAT_ATTR_ARCHIVE;
}
- exfat_update_bh(new_bh, IS_DIRSYNC(inode));
- brelse(mov_bh);
- brelse(new_bh);
-
- epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh);
- if (!epmov)
- return -EIO;
- epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh);
- if (!epnew) {
- brelse(mov_bh);
- return -EIO;
- }
+ epmov = exfat_get_dentry_cached(&mov_es, ES_IDX_STREAM);
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_STREAM);
*epnew = *epmov;
- exfat_update_bh(new_bh, IS_DIRSYNC(inode));
- brelse(mov_bh);
- brelse(new_bh);
-
- ret = exfat_init_ext_entry(inode, p_newdir, newentry, num_new_entries,
- p_uniname);
- if (ret)
- return ret;
- exfat_remove_entries(inode, p_olddir, oldentry, 0, num_old_entries);
+ exfat_init_ext_entry(&new_es, num_new_entries, p_uniname);
+ exfat_remove_entries(inode, &mov_es, ES_IDX_FILE);
exfat_chain_set(&ei->dir, p_newdir->dir, p_newdir->size,
p_newdir->flags);
ei->entry = newentry;
- return 0;
+
+ ret = exfat_put_dentry_set(&new_es, IS_DIRSYNC(inode));
+ if (ret)
+ goto put_mov_es;
+
+ return exfat_put_dentry_set(&mov_es, IS_DIRSYNC(inode));
+
+put_mov_es:
+ exfat_put_dentry_set(&mov_es, false);
+
+ return ret;
}
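
Both conversions above follow the entry-set pattern this series introduces: fetch the whole dentry set once, edit the cached entries in place, then write everything back with a single (optionally synchronous) put, instead of juggling one buffer_head per entry. A minimal sketch using only the helpers visible in this diff; the function itself is illustrative, not part of the patch:

#include "exfat_fs.h"	/* exfat-internal header, assumed for the sketch */

static int touch_file_attr(struct super_block *sb, struct exfat_chain *dir,
			   int entry, bool sync)
{
	struct exfat_entry_set_cache es;
	struct exfat_dentry *ep;

	if (exfat_get_dentry_set(&es, sb, dir, entry, ES_ALL_ENTRIES))
		return -EIO;

	/* no per-entry get_dentry()/brelse(); the set caches the buffers */
	ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
	ep->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);

	/* writes back every dirty buffer of the set, waiting if sync */
	return exfat_put_dentry_set(&es, sync);
}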
/* rename or move an old file into a new file */
@@ -1186,7 +1112,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_sb_info *sbi = EXFAT_SB(sb);
const unsigned char *new_path = new_dentry->d_name.name;
struct inode *new_inode = new_dentry->d_inode;
- int num_entries;
struct exfat_inode_info *new_ei = NULL;
unsigned int new_entry_type = TYPE_UNUSED;
int new_entry = 0;
@@ -1257,25 +1182,21 @@ static int __exfat_rename(struct inode *old_parent_inode,
&newdir, &uni_name, ei);
if (!ret && new_inode) {
+ struct exfat_entry_set_cache es;
+
/* delete entries of new_dir */
- ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
- if (!ep) {
+ ret = exfat_get_dentry_set(&es, sb, p_dir, new_entry,
+ ES_ALL_ENTRIES);
+ if (ret) {
ret = -EIO;
goto del_out;
}
- num_entries = exfat_count_ext_entries(sb, p_dir, new_entry, ep);
- if (num_entries < 0) {
- ret = -EIO;
- goto del_out;
- }
- brelse(new_bh);
+ exfat_remove_entries(new_inode, &es, ES_IDX_FILE);
- if (exfat_remove_entries(new_inode, p_dir, new_entry, 0,
- num_entries + 1)) {
- ret = -EIO;
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(new_inode));
+ if (ret)
goto del_out;
- }
/* Free the clusters if new_inode is a dir (as in exfat_rmdir) */
if (new_entry_type == TYPE_DIR &&
@@ -1317,6 +1238,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
struct super_block *sb = old_dir->i_sb;
loff_t i_pos;
int err;
+ loff_t size = i_size_read(new_dir);
/*
* The VFS already checks for existence, so for local filesystems
@@ -1338,7 +1260,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
exfat_truncate_inode_atime(new_dir);
- if (IS_DIRSYNC(new_dir))
+ if (IS_DIRSYNC(new_dir) && size != i_size_read(new_dir))
exfat_sync_inode(new_dir);
else
mark_inode_dirty(new_dir);
@@ -1359,9 +1281,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
}
inode_inc_iversion(old_dir);
- if (IS_DIRSYNC(old_dir))
- exfat_sync_inode(old_dir);
- else
+ if (new_dir != old_dir)
mark_inode_dirty(old_dir);
if (new_inode) {
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 74d98965902e..d6cfb1849580 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,16 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
- tristate "Second extended fs support"
+ tristate "Second extended fs support (DEPRECATED)"
select BUFFER_HEAD
select FS_IOMAP
select LEGACY_DIRECT_IO
help
Ext2 is a standard Linux file system for hard disks.
- To compile this file system support as a module, choose M here: the
- module will be called ext2.
+ This filesystem driver is deprecated because it does not properly
+ support inode time stamps beyond 03:14:07 UTC on 19 January 2038.
- If unsure, say Y.
+	  Ext2 users are advised to use the ext4 driver to access their filesystem.
+	  The driver is fully compatible, supports filesystems without a journal
+	  or extents, and also supports larger time stamps if the filesystem
+	  is created with at least 256-byte inodes.
+
+ This code is kept as a simple reference for filesystem developers.
+
+ If unsure, say N.
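
For reference, the migration the help text recommends needs no conversion step: the ext4 driver mounts an existing ext2 filesystem as-is. A userspace sketch of the equivalent of "mount -t ext4 /dev/sdXN /mnt"; the device and mount point are placeholders:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* placeholders: substitute the real ext2 partition and mount point */
	if (mount("/dev/sdXN", "/mnt", "ext4", 0, NULL) != 0) {
		perror("mount ext2 volume with the ext4 driver");
		return 1;
	}
	return 0;
}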
config EXT2_FS_XATTR
bool "Ext2 extended attributes"
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e124f3d709b2..1bfd6ab11038 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -412,7 +412,7 @@ void ext2_init_block_alloc_info(struct inode *inode)
struct ext2_block_alloc_info *block_i;
struct super_block *sb = inode->i_sb;
- block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+ block_i = kmalloc(sizeof(*block_i), GFP_KERNEL);
if (block_i) {
struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 677a9ad45dcb..f38bdd46e4f7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -674,7 +674,7 @@ struct ext2_inode_info {
struct inode vfs_inode;
struct list_head i_orphan; /* unlinked but open inodes */
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 5a4272b2c6b0..f3d570a9302b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -754,7 +754,7 @@ static int ext2_get_blocks(struct inode *inode,
*/
err = sb_issue_zeroout(inode->i_sb,
le32_to_cpu(chain[depth-1].key), count,
- GFP_NOFS);
+ GFP_KERNEL);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index cabea887314d..37f7ce56adce 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -319,7 +319,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, siz
static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
static int ext2_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static struct dquot **ext2_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext2_get_dquots(struct inode *inode)
{
return EXT2_I(inode)->i_dquot;
}
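
The __rcu changes in this and the matching ext4 hunk below are annotation-only: they let sparse verify that i_dquot readers go through rcu_dereference() and writers through rcu_assign_pointer(). A generic sketch of what the annotation enforces; the struct and helpers are illustrative, not the quota code:

#include <linux/rcupdate.h>

struct dquot;				/* opaque for the sketch */

struct example {
	struct dquot __rcu *dq;		/* sparse tracks this pointer */
};

static struct dquot *example_read(struct example *e)
{
	/* caller must hold rcu_read_lock(); sparse warns on a plain load */
	return rcu_dereference(e->dq);
}

static void example_publish(struct example *e, struct dquot *new_dq)
{
	/* pairs with the read side; orders initialization before publish */
	rcu_assign_pointer(e->dq, new_dq);
}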
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e849241ebb8f..c885dcc3bd0d 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -874,7 +874,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
int error;
- error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr,
+ error = mb_cache_entry_create(cache, GFP_KERNEL, hash, bh->b_blocknr,
true);
if (error) {
if (error == -EBUSY) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c0d7d143036..8d126654019e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1158,7 +1158,7 @@ struct ext4_inode_info {
tid_t i_datasync_tid;
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7669d154c05e..e57054bdc5fd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4111,10 +4111,10 @@ insert_hole:
*
* Need to be called with
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
* return > 0, number of blocks already mapped/allocated
- * if create == 0 and these are pre-allocated blocks
+ * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
*
@@ -4218,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/*
* requested block isn't allocated yet;
- * we couldn't try to create block if create flag is zero
+	 * we can't create blocks if flags doesn't contain EXT4_GET_BLOCKS_CREATE
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
ext4_lblk_t len;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2ccf3b5e3a7c..537803250ca9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -465,9 +465,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated. if
- * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
- * is marked as unwritten. If the create == 1, it will mark @map as mapped.
+ * On success, it returns the number of blocks being mapped or allocated.
+ * If flags doesn't contain EXT4_GET_BLOCKS_CREATE and the blocks are
+ * pre-allocated and unwritten, the resulting @map is marked as unwritten.
+ * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
* that case, @map is returned as unmapped but we still do fill map->m_len to
@@ -589,8 +590,7 @@ found:
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns the create = 0
- * with buffer head unmapped.
+ * ext4_ext_map_blocks() returns with buffer head unmapped
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
/*
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index f94901fd3835..044ca5238f41 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
#include <kunit/test.h>
#include <kunit/static_stub.h>
+#include <linux/random.h>
#include "ext4.h"
@@ -20,41 +21,135 @@ struct mbt_ctx {
};
struct mbt_ext4_super_block {
- struct super_block sb;
+ struct ext4_super_block es;
+ struct ext4_sb_info sbi;
struct mbt_ctx mbt_ctx;
};
-#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx))
+#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi))
+#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+static const struct super_operations mbt_sops = {
+};
+
+static void mbt_kill_sb(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type mbt_fs_type = {
+ .name = "mballoc test",
+ .kill_sb = mbt_kill_sb,
+};
+
+static int mbt_mb_init(struct super_block *sb)
+{
+ ext4_fsblk_t block;
+ int ret;
+
+ /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
+ sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL);
+ if (sb->s_bdev == NULL)
+ return -ENOMEM;
+
+ sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL);
+ if (sb->s_bdev->bd_queue == NULL) {
+ kfree(sb->s_bdev);
+ return -ENOMEM;
+ }
+
+ /*
+ * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache =
+ * new_inode(sb);
+ */
+ INIT_LIST_HEAD(&sb->s_inodes);
+ sb->s_op = &mbt_sops;
+
+ ret = ext4_mb_init(sb);
+ if (ret != 0)
+ goto err_out;
+
+ block = ext4_count_free_clusters(sb);
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_mb_release;
+
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_freeclusters;
+
+ return 0;
+
+err_freeclusters:
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+err_mb_release:
+ ext4_mb_release(sb);
+err_out:
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+ return ret;
+}
+
+static void mbt_mb_release(struct super_block *sb)
+{
+ percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter);
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+ ext4_mb_release(sb);
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+}
+
+static int mbt_set(struct super_block *sb, void *data)
+{
+ return 0;
+}
+
static struct super_block *mbt_ext4_alloc_super_block(void)
{
- struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL);
- struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ struct mbt_ext4_super_block *fsb;
+ struct super_block *sb;
+ struct ext4_sb_info *sbi;
+
+ fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ if (fsb == NULL)
+ return NULL;
- if (fsb == NULL || sbi == NULL || es == NULL)
+ sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+ if (IS_ERR(sb))
goto out;
- sbi->s_es = es;
- fsb->sb.s_fs_info = sbi;
- return &fsb->sb;
+ sbi = &fsb->sbi;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock)
+ goto out_deactivate;
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ sbi->s_es = &fsb->es;
+ sb->s_fs_info = sbi;
+
+ up_write(&sb->s_umount);
+ return sb;
+out_deactivate:
+ deactivate_locked_super(sb);
out:
kfree(fsb);
- kfree(sbi);
- kfree(es);
return NULL;
}
static void mbt_ext4_free_super_block(struct super_block *sb)
{
- struct mbt_ext4_super_block *fsb =
- container_of(sb, struct mbt_ext4_super_block, sb);
+ struct mbt_ext4_super_block *fsb = MBT_SB(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- kfree(sbi->s_es);
- kfree(sbi);
+ kfree(sbi->s_blockgroup_lock);
+ deactivate_super(sb);
kfree(fsb);
}
@@ -82,6 +177,9 @@ static void mbt_init_sb_layout(struct super_block *sb,
sbi->s_clusters_per_group = layout->blocks_per_group >>
layout->cluster_bits;
sbi->s_desc_size = layout->desc_size;
+ sbi->s_desc_per_block_bits =
+ sb->s_blocksize_bits - (fls(layout->desc_size) - 1);
+ sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits;
es->s_first_data_block = cpu_to_le32(0);
es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
@@ -91,9 +189,13 @@ static void mbt_init_sb_layout(struct super_block *sb,
static int mbt_grp_ctx_init(struct super_block *sb,
struct mbt_grp_ctx *grp_ctx)
{
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+
grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
if (grp_ctx->bitmap_bh.b_data == NULL)
return -ENOMEM;
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max);
+ ext4_free_group_clusters_set(sb, &grp_ctx->desc, max);
return 0;
}
@@ -112,6 +214,13 @@ static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
}
+static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ return grp_ctx->bitmap_bh.b_data;
+}
+
/* called after mbt_init_sb_layout */
static int mbt_ctx_init(struct super_block *sb)
{
@@ -133,6 +242,8 @@ static int mbt_ctx_init(struct super_block *sb)
* block which will fail ext4_sb_block_valid check.
*/
mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+ ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc,
+ EXT4_CLUSTERS_PER_GROUP(sb) - 1);
return 0;
out:
@@ -167,6 +278,13 @@ static int ext4_wait_block_bitmap_stub(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh)
{
+ /*
+ * real ext4_wait_block_bitmap will set these flags and
+ * functions like ext4_mb_init_cache will verify the flags.
+ */
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ set_buffer_verified(bh);
return 0;
}
@@ -232,6 +350,14 @@ static int mbt_kunit_init(struct kunit *test)
kunit_activate_static_stub(test,
ext4_mb_mark_context,
ext4_mb_mark_context_stub);
+
+ /* stub function will be called in mbt_mb_init->ext4_mb_init */
+ if (mbt_mb_init(sb) != 0) {
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -239,6 +365,7 @@ static void mbt_kunit_exit(struct kunit *test)
{
struct super_block *sb = (struct super_block *)test->priv;
+ mbt_mb_release(sb);
mbt_ctx_release(sb);
mbt_ext4_free_super_block(sb);
}
@@ -246,14 +373,19 @@ static void mbt_kunit_exit(struct kunit *test)
static void test_new_blocks_simple(struct kunit *test)
{
struct super_block *sb = (struct super_block *)test->priv;
- struct inode inode = { .i_sb = sb, };
+ struct inode *inode;
struct ext4_allocation_request ar;
ext4_group_t i, goal_group = TEST_GOAL_GROUP;
int err = 0;
ext4_fsblk_t found;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ar.inode = &inode;
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+
+ inode->i_sb = sb;
+ ar.inode = inode;
/* get block at goal */
ar.goal = ext4_group_first_block_no(sb, goal_group);
@@ -297,6 +429,436 @@ static void test_new_blocks_simple(struct kunit *test)
"unexpectedly get block when no block is available");
}
+#define TEST_RANGE_COUNT 8
+
+struct test_range {
+ ext4_grpblk_t start;
+ ext4_grpblk_t len;
+};
+
+static void
+mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges,
+ int count)
+{
+ ext4_grpblk_t start, len, max;
+ int i;
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb) / count;
+ for (i = 0; i < count; i++) {
+ start = get_random_u32() % max;
+ len = get_random_u32() % max;
+ len = min(len, max - start);
+
+ ranges[i].start = start + i * max;
+ ranges[i].len = len;
+ }
+}
+
+static void
+validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
+ ext4_group_t goal_group, ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ void *bitmap;
+ ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++) {
+ if (i == goal_group)
+ continue;
+
+ bitmap = mbt_ctx_bitmap(sb, i);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ_MSG(test, bit, max,
+ "free block on unexpected group %d", i);
+ }
+
+ bitmap = mbt_ctx_bitmap(sb, goal_group);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, bit, start);
+
+ bit = mb_find_next_bit(bitmap, max, bit + 1);
+ KUNIT_ASSERT_EQ(test, bit, start + len);
+}
+
+static void
+test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
+ ext4_grpblk_t start, ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode;
+ ext4_fsblk_t block;
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ if (len == 0)
+ return;
+
+ block = ext4_group_first_block_no(sb, goal_group) +
+ EXT4_C2B(sbi, start);
+ ext4_free_blocks_simple(inode, block, len);
+ validate_free_blocks_simple(test, sb, goal_group, start, len);
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+}
+
+static void test_free_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, max);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_free_blocks_simple_range(test, TEST_GOAL_GROUP,
+ ranges[i].start, ranges[i].len);
+}
+
+static void
+test_mark_diskspace_used_range(struct kunit *test,
+ struct ext4_allocation_context *ac,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int ret;
+ void *bitmap;
+ ext4_grpblk_t i, max;
+
+ /* ext4_mb_mark_diskspace_used will BUG if len is 0 */
+ if (len == 0)
+ return;
+
+ ac->ac_b_ex.fe_group = TEST_GOAL_GROUP;
+ ac->ac_b_ex.fe_start = start;
+ ac->ac_b_ex.fe_len = len;
+
+ bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
+ memset(bitmap, 0, sb->s_blocksize);
+ ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ i = mb_find_next_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, i, start);
+ i = mb_find_next_zero_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, i, start + len);
+ i = mb_find_next_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, max, i);
+}
+
+static void test_mark_diskspace_used(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_context ac;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ ac.ac_status = AC_STATUS_FOUND;
+ ac.ac_sb = sb;
+ ac.ac_inode = inode;
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mark_diskspace_used_range(test, &ac, ranges[i].start,
+ ranges[i].len);
+}
+
+static void mbt_generate_buddy(struct super_block *sb, void *buddy,
+ void *bitmap, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ uint32_t order, off;
+ void *bb, *bb_h;
+ int max;
+
+ memset(buddy, 0xff, sb->s_blocksize);
+ memset(grp, 0, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]));
+
+ bb = bitmap;
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ bb_h = buddy + sbi->s_mb_offsets[1];
+
+ off = mb_find_next_zero_bit(bb, max, 0);
+ grp->bb_first_free = off;
+ while (off < max) {
+ grp->bb_counters[0]++;
+ grp->bb_free++;
+
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ grp->bb_free++;
+ grp->bb_counters[0]--;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[1]++;
+ grp->bb_largest_free_order = 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+
+ for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
+ bb = buddy + sbi->s_mb_offsets[order];
+ bb_h = buddy + sbi->s_mb_offsets[order + 1];
+ max = max >> 1;
+ off = mb_find_next_zero_bit(bb, max, 0);
+
+ while (off < max) {
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ mb_set_bits(bb, off, 2);
+ grp->bb_counters[order] -= 2;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[order + 1]++;
+ grp->bb_largest_free_order = order + 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+ }
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ off = mb_find_next_zero_bit(bitmap, max, 0);
+ while (off < max) {
+ grp->bb_fragments++;
+
+ off = mb_find_next_bit(bitmap, max, off + 1);
+ if (off + 1 >= max)
+ break;
+
+ off = mb_find_next_zero_bit(bitmap, max, off + 1);
+ }
+}
+
+static void
+mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1,
+ struct ext4_group_info *grp2)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ KUNIT_ASSERT_EQ(test, grp1->bb_first_free,
+ grp2->bb_first_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_fragments,
+ grp2->bb_fragments);
+ KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order,
+ grp2->bb_largest_free_order);
+
+ for (i = 1; i < MB_NUM_ORDERS(sb); i++) {
+ KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i],
+ grp2->bb_counters[i],
+ "bb_counters[%d] diffs, expected %d, generated %d",
+ i, grp1->bb_counters[i],
+ grp2->bb_counters[i]);
+ }
+}
+
+static void
+do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
+ void *mbt_buddy, struct ext4_group_info *mbt_grp,
+ void *ext4_buddy, struct ext4_group_info *ext4_grp)
+{
+ int i;
+
+ mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp);
+
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ ext4_grp->bb_counters[i] = 0;
+ /* needed by validation in ext4_mb_generate_buddy */
+ ext4_grp->bb_free = mbt_grp->bb_free;
+ memset(ext4_buddy, 0xff, sb->s_blocksize);
+ ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+ ext4_grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, mbt_grp, ext4_grp);
+}
+
+static void test_mb_generate_buddy(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *expected_bb, *generate_bb;
+ struct ext4_group_info *expected_grp, *generate_grp;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb);
+ generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb);
+ expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp);
+ generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP);
+ KUNIT_ASSERT_NOT_NULL(test, generate_grp);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ mb_set_bits(bitmap, ranges[i].start, ranges[i].len);
+ do_test_generate_buddy(test, sb, bitmap, expected_bb,
+ expected_grp, generate_bb, generate_grp);
+ }
+}
+
+static void
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int i;
+
+ /* mb_mark_used only accepts non-zero len */
+ if (len == 0)
+ return;
+
+ ex.fe_start = start;
+ ex.fe_len = len;
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ mb_set_bits(bitmap, start, len);
+	/* bypass bb_free validation in ext4_mb_generate_buddy */
+ grp->bb_free -= len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+static void test_mb_mark_used(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_mark_used_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static void
+test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ /* mb_free_blocks will WARN if len is 0 */
+ if (len == 0)
+ return;
+
+ ext4_lock_group(sb, e4b->bd_group);
+ mb_free_blocks(NULL, e4b, start, len);
+ ext4_unlock_group(sb, e4b->bd_group);
+
+ mb_clear_bits(bitmap, start, len);
+	/* bypass bb_free validation in ext4_mb_generate_buddy */
+ grp->bb_free += len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+static void test_mb_free_blocks(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ struct ext4_free_extent ex;
+ int ret;
+ int i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_start = 0;
+ ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb);
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ grp->bb_free = 0;
+ memset(bitmap, 0xff, sb->s_blocksize);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_free_blocks_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
{
.blocksize_bits = 10,
@@ -334,6 +896,11 @@ KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
static struct kunit_case mbt_test_cases[] = {
KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
{}
};
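
For context, a parameterized case table like this is normally wired into a suite and registered with kunit_test_suites(); the registration below is a sketch of that boilerplate, not shown in the hunk itself (the suite name is assumed):

static struct kunit_suite mbt_test_suite = {
	.name = "ext4_mballoc_test",	/* assumed name */
	.init = mbt_kunit_init,		/* builds the fake super_block */
	.exit = mbt_kunit_exit,		/* tears it down */
	.test_cases = mbt_test_cases,
};

kunit_test_suites(&mbt_test_suite);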
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e4f7cf9d89c4..12b3f196010b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3015,8 +3015,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
- int i;
- int err, buddy_loaded = 0;
+ int i, err;
+ char nbuf[16];
struct ext4_buddy e4b;
struct ext4_group_info *grinfo;
unsigned char blocksize_bits = min_t(unsigned char,
@@ -3043,23 +3043,26 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
+ seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
return 0;
}
- buddy_loaded = 1;
+ ext4_mb_unload_buddy(&e4b);
}
+ /*
+ * We care only about free space counters in the group info and
+ * these are safe to access even after the buddy has been unloaded
+ */
memcpy(&sg, grinfo, i);
-
- if (buddy_loaded)
- ext4_mb_unload_buddy(&e4b);
-
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_puts(seq, " ]\n");
+ seq_puts(seq, " ]");
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+ seq_puts(seq, " Block bitmap corrupted!");
+ seq_puts(seq, "\n");
return 0;
}
@@ -3829,8 +3832,7 @@ void ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count,
- struct bio **biop)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
ext4_fsblk_t discard_block;
@@ -3839,13 +3841,8 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- if (biop) {
- return __blkdev_issue_discard(sb->s_bdev,
- (sector_t)discard_block << (sb->s_blocksize_bits - 9),
- (sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, biop);
- } else
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
static void ext4_free_data_in_buddy(struct super_block *sb,
@@ -5169,10 +5166,16 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
.fe_len = ac->ac_orig_goal_len,
};
loff_t orig_goal_end = extent_logical_end(sbi, &ex);
+ loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
- /* we can't allocate as much as normalizer wants.
- * so, found space must get proper lstart
- * to cover original request */
+ /*
+ * We can't allocate as much as normalizer wants, so we try
+ * to get proper lstart to cover the original request, except
+ * when the goal doesn't cover the original request as below:
+ *
+ * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
+ * best_ex:0/200(200) -> adjusted: 1848/2048(200)
+ */
BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
@@ -5184,7 +5187,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
* 1. Check if best ex can be kept at end of goal (before
* cr_best_avail trimmed it) and still cover original start
* 2. Else, check if best ex can be kept at start of goal and
- * still cover original start
+ * still cover original end
* 3. Else, keep the best ex at start of original request.
*/
ex.fe_len = ac->ac_b_ex.fe_len;
@@ -5194,7 +5197,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
goto adjust_bex;
ex.fe_logical = ac->ac_g_ex.fe_logical;
- if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex))
+ if (o_ex_end <= extent_logical_end(sbi, &ex))
goto adjust_bex;
ex.fe_logical = ac->ac_o_ex.fe_logical;
@@ -5202,7 +5205,6 @@ adjust_bex:
ac->ac_b_ex.fe_logical = ex.fe_logical;
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
- BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
}
@@ -6487,8 +6489,14 @@ do_more:
} else {
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, block_group, bit,
- count_clusters, NULL);
- if (err && err != -EOPNOTSUPP)
+ count_clusters);
+ /*
+ * Ignore EOPNOTSUPP error. This is consistent with
+			 * what happens when using a journal.
+ */
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
@@ -6738,7 +6746,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count, NULL);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4d4a5a32e310..0ba9837d65ca 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1602,7 +1602,8 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = ext4_has_feature_meta_bg(sb);
+ int meta_bg = ext4_has_feature_meta_bg(sb) &&
+ gdb_num >= le32_to_cpu(es->s_first_meta_bg);
sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
ext4_group_first_block_no(sb, 0);
@@ -2084,7 +2085,7 @@ retry:
}
}
- if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) {
err = ext4_convert_meta_bg(sb, resize_inode);
if (err)
goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 59c72b6dd153..cfb8449c731f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1599,7 +1599,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
-static struct dquot **ext4_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
{
return EXT4_I(inode)->i_dquot;
}
@@ -4421,22 +4421,6 @@ static int ext4_handle_clustersize(struct super_block *sb)
}
sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- return -EINVAL;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- return -EINVAL;
- }
} else {
if (clustersize != sb->s_blocksize) {
ext4_msg(sb, KERN_ERR,
@@ -4450,9 +4434,21 @@ static int ext4_handle_clustersize(struct super_block *sb)
sbi->s_blocks_per_group);
return -EINVAL;
}
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
sbi->s_cluster_bits = 0;
}
+ sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR,
+ "blocks per group (%lu) and clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group, sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
/* Do we have standard group size of clustersize * 8 blocks ? */
@@ -6864,6 +6860,10 @@ static int ext4_write_dquot(struct dquot *dquot)
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to commit dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -6880,6 +6880,10 @@ static int ext4_acquire_dquot(struct dquot *dquot)
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_acquire(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to acquire dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -6899,6 +6903,10 @@ static int ext4_release_dquot(struct dquot *dquot)
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to release dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 82dc5e673d5c..b67a176bfcf9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1565,46 +1565,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
/*
* Add value of the EA in an inode.
*/
-static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
- const void *value, size_t value_len,
- struct inode **ret_inode)
+static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
+ struct inode *inode, const void *value, size_t value_len)
{
struct inode *ea_inode;
u32 hash;
int err;
+ /* Account inode & space to quota even if sharing... */
+ err = ext4_xattr_inode_alloc_quota(inode, value_len);
+ if (err)
+ return ERR_PTR(err);
+
hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
if (ea_inode) {
err = ext4_xattr_inode_inc_ref(handle, ea_inode);
- if (err) {
- iput(ea_inode);
- return err;
- }
-
- *ret_inode = ea_inode;
- return 0;
+ if (err)
+ goto out_err;
+ return ea_inode;
}
/* Create an inode for the EA value */
ea_inode = ext4_xattr_inode_create(handle, inode, hash);
- if (IS_ERR(ea_inode))
- return PTR_ERR(ea_inode);
+ if (IS_ERR(ea_inode)) {
+ ext4_xattr_inode_free_quota(inode, NULL, value_len);
+ return ea_inode;
+ }
err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
if (err) {
if (ext4_xattr_inode_dec_ref(handle, ea_inode))
ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
- iput(ea_inode);
- return err;
+ goto out_err;
}
if (EA_INODE_CACHE(inode))
mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
ea_inode->i_ino, true /* reusable */);
-
- *ret_inode = ea_inode;
- return 0;
+ return ea_inode;
+out_err:
+ iput(ea_inode);
+ ext4_xattr_inode_free_quota(inode, NULL, value_len);
+ return ERR_PTR(err);
}
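
The conversion above is the standard ERR_PTR idiom: instead of an int return plus an out-parameter, the pointer itself carries the error, which lets the quota charge and its rollback live in one function. A generic sketch of the idiom with hypothetical make_thing()/finish_thing() steps:

#include <linux/err.h>
#include <linux/fs.h>

static struct inode *make_thing(struct inode *parent);	/* hypothetical */
static int finish_thing(struct inode *ea);		/* hypothetical */

static struct inode *lookup_or_create(struct inode *parent)
{
	struct inode *ea;
	int err;

	ea = make_thing(parent);	/* hypothetical allocation step */
	if (!ea)
		return ERR_PTR(-ENOMEM);

	err = finish_thing(ea);		/* hypothetical setup step */
	if (err) {
		iput(ea);		/* undo before reporting */
		return ERR_PTR(err);
	}
	return ea;
}

/* caller side:
 *	ea = lookup_or_create(inode);
 *	if (IS_ERR(ea))
 *		return PTR_ERR(ea);
 */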
/*
@@ -1712,16 +1715,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
if (i->value && in_inode) {
WARN_ON_ONCE(!i->value_len);
- ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
- if (ret)
- goto out;
-
- ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
- i->value_len,
- &new_ea_inode);
- if (ret) {
+ new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+ i->value, i->value_len);
+ if (IS_ERR(new_ea_inode)) {
+ ret = PTR_ERR(new_ea_inode);
new_ea_inode = NULL;
- ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
goto out;
}
}
@@ -2160,17 +2158,6 @@ getblk_failed:
ENTRY(header(s->base)+1));
if (error)
goto getblk_failed;
- if (ea_inode) {
- /* Drop the extra ref on ea_inode. */
- error = ext4_xattr_inode_dec_ref(handle,
- ea_inode);
- if (error)
- ext4_warning_inode(ea_inode,
- "dec ref error=%d",
- error);
- iput(ea_inode);
- ea_inode = NULL;
- }
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, sb,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b0597a539fc5..eac698b8dd38 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -154,49 +154,47 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
if (unlikely(f2fs_cp_error(sbi)))
return exist;
- if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
- f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
- blkaddr, exist);
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- return exist;
- }
+ if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) ||
+ (!exist && type == DATA_GENERIC_ENHANCE))
+ goto out_err;
+ if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE)
+ goto out_handle;
+ return exist;
- if (!exist && type == DATA_GENERIC_ENHANCE) {
- f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
- blkaddr, exist);
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- dump_stack();
- }
+out_err:
+ f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+ blkaddr, exist);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ dump_stack();
+out_handle:
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return exist;
}
-bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
- if (time_to_inject(sbi, FAULT_BLKADDR))
- return false;
-
switch (type) {
case META_NAT:
break;
case META_SIT:
if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
- return false;
+ goto err;
break;
case META_SSA:
if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
blkaddr < SM_I(sbi)->ssa_blkaddr))
- return false;
+ goto err;
break;
case META_CP:
if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
blkaddr < __start_cp_addr(sbi)))
- return false;
+ goto err;
break;
case META_POR:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi)))
- return false;
+ goto err;
break;
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
@@ -213,7 +211,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
blkaddr);
set_sbi_flag(sbi, SBI_NEED_FSCK);
dump_stack();
- return false;
+ goto err;
} else {
return __is_bitmap_valid(sbi, blkaddr, type);
}
@@ -221,13 +219,30 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
case META_GENERIC:
if (unlikely(blkaddr < SEG0_BLKADDR(sbi) ||
blkaddr >= MAIN_BLKADDR(sbi)))
- return false;
+ goto err;
break;
default:
BUG();
}
return true;
+err:
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ return false;
+}
+
+bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type)
+{
+ if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
+ return false;
+ return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
+}
+
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type)
+{
+ return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
}
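
The split follows a common fault-injection shape: the double-underscore helper keeps the real validation, the public wrapper adds the injection point, and the _raw variant bypasses it for callers that must never see injected failures. A generic sketch; should_fail_check() is a hypothetical hook:

static bool should_fail_check(void);	/* hypothetical injection hook */

static bool __check(int x)
{
	return x >= 0;			/* the real validation */
}

static bool check(int x)
{
	if (should_fail_check())
		return false;		/* simulate a failure */
	return __check(x);
}

static bool check_raw(int x)
{
	return __check(x);		/* never injected */
}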
/*
@@ -889,7 +904,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
- if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
+ if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
@@ -1324,7 +1339,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason & CP_UMOUNT) {
if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
- NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) {
+ NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) {
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to no space");
} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
@@ -1527,7 +1542,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
cp_ver |= ((__u64)crc32 << 32);
*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
- blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+ blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++)
f2fs_update_meta_page(sbi, nm_i->nat_bits +
(i << F2FS_BLKSIZE_BITS), blk + i);
@@ -1587,8 +1602,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
*/
if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
f2fs_sb_has_compression(sbi))
- invalidate_mapping_pages(META_MAPPING(sbi),
- MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
+ f2fs_bug_on(sbi,
+ invalidate_inode_pages2_range(META_MAPPING(sbi),
+ MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1));
f2fs_release_ino_entry(sbi, false);
@@ -1730,9 +1746,9 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
im->ino_num = 0;
}
- sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+ sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS -
NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
- F2FS_ORPHANS_PER_BLOCK;
+ F2FS_ORPHANS_PER_BLOCK;
}
int __init f2fs_create_checkpoint_caches(void)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 531517dac079..8892c8262141 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -512,8 +512,8 @@ static int lzorle_compress_pages(struct compress_ctx *cc)
ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
&cc->clen, cc->private);
if (ret != LZO_E_OK) {
- printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "lzo-rle compress failed, ret:%d", ret);
return -EIO;
}
return 0;
@@ -780,9 +780,9 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
if (provided != calculated) {
if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) {
set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT);
- printk_ratelimited(
- "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x",
- KERN_INFO, sbi->sb->s_id, dic->inode->i_ino,
+ f2fs_info_ratelimited(sbi,
+ "checksum invalid, nid = %lu, %x vs %x",
+ dic->inode->i_ino,
provided, calculated);
}
set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -1418,6 +1418,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
struct f2fs_sb_info *sbi = bio->bi_private;
struct compress_io_ctx *cic =
(struct compress_io_ctx *)page_private(page);
+ enum count_type type = WB_DATA_TYPE(page,
+ f2fs_is_compressed_page(page));
int i;
if (unlikely(bio->bi_status))
@@ -1425,7 +1427,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
f2fs_compress_free_page(page);
- dec_page_count(sbi, F2FS_WB_DATA);
+ dec_page_count(sbi, type);
if (atomic_dec_return(&cic->pending_pages))
return;
@@ -1441,12 +1443,14 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
}
static int f2fs_write_raw_pages(struct compress_ctx *cc,
- int *submitted,
+ int *submitted_p,
struct writeback_control *wbc,
enum iostat_type io_type)
{
struct address_space *mapping = cc->inode->i_mapping;
- int _submitted, compr_blocks, ret, i;
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ int submitted, compr_blocks, i;
+ int ret = 0;
compr_blocks = f2fs_compressed_blocks(cc);
@@ -1461,6 +1465,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
if (compr_blocks < 0)
return compr_blocks;
+ /* overwrite compressed cluster w/ normal cluster */
+ if (compr_blocks > 0)
+ f2fs_lock_op(sbi);
+
for (i = 0; i < cc->cluster_size; i++) {
if (!cc->rpages[i])
continue;
@@ -1485,7 +1493,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(cc->rpages[i]))
goto continue_unlock;
- ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
+ ret = f2fs_write_single_data_page(cc->rpages[i], &submitted,
NULL, NULL, wbc, io_type,
compr_blocks, false);
if (ret) {
@@ -1493,26 +1501,29 @@ continue_unlock:
unlock_page(cc->rpages[i]);
ret = 0;
} else if (ret == -EAGAIN) {
+ ret = 0;
/*
				 * for quota files, just redirty the remaining
				 * pages to avoid a deadlock caused by a cluster
				 * update race with foreground operations.
*/
if (IS_NOQUOTA(cc->inode))
- return 0;
- ret = 0;
+ goto out;
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry_write;
}
- return ret;
+ goto out;
}
- *submitted += _submitted;
+ *submitted_p += submitted;
}
- f2fs_balance_fs(F2FS_M_SB(mapping), true);
+out:
+ if (compr_blocks > 0)
+ f2fs_unlock_op(sbi);
- return 0;
+ f2fs_balance_fs(sbi, true);
+ return ret;
}
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -1806,16 +1817,18 @@ void f2fs_put_page_dic(struct page *page, bool in_task)
* check whether cluster blocks are contiguous, and add extent cache entry
* only if cluster blocks are logically and physically contiguous.
*/
-unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
+ unsigned int ofs_in_node)
{
- bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR;
+ bool compressed = data_blkaddr(dn->inode, dn->node_page,
+ ofs_in_node) == COMPRESS_ADDR;
int i = compressed ? 1 : 0;
block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
- dn->ofs_in_node + i);
+ ofs_in_node + i);
for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
- dn->ofs_in_node + i);
+ ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
break;
@@ -1878,12 +1891,8 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
set_page_private_data(cpage, ino);
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
- goto out;
-
memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
SetPageUptodate(cpage);
-out:
f2fs_put_page(cpage, 1);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 26e317696b33..d9494b5fc7c1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -48,7 +48,7 @@ void f2fs_destroy_bioset(void)
bioset_exit(&f2fs_bioset);
}
-static bool __is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(struct page *page)
{
struct address_space *mapping = page->mapping;
struct inode *inode;
@@ -65,8 +65,6 @@ static bool __is_cp_guaranteed(struct page *page)
S_ISDIR(inode->i_mode))
return true;
- if (f2fs_is_compressed_page(page))
- return false;
if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
page_private_gcing(page))
return true;
@@ -338,18 +336,7 @@ static void f2fs_write_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
- enum count_type type = WB_DATA_TYPE(page);
-
- if (page_private_dummy(page)) {
- clear_page_private_dummy(page);
- unlock_page(page);
- mempool_free(page, sbi->write_io_dummy);
-
- if (unlikely(bio->bi_status))
- f2fs_stop_checkpoint(sbi, true,
- STOP_CP_REASON_WRITE_FAIL);
- continue;
- }
+ enum count_type type = WB_DATA_TYPE(page, false);
fscrypt_finalize_bounce_page(&page);
@@ -524,50 +511,13 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
submit_bio(bio);
}
-static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio)
-{
- unsigned int start =
- (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi);
-
- if (start == 0)
- return;
-
- /* fill dummy pages */
- for (; start < F2FS_IO_SIZE(sbi); start++) {
- struct page *page =
- mempool_alloc(sbi->write_io_dummy,
- GFP_NOIO | __GFP_NOFAIL);
- f2fs_bug_on(sbi, !page);
-
- lock_page(page);
-
- zero_user_segment(page, 0, PAGE_SIZE);
- set_page_private_dummy(page);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
- f2fs_bug_on(sbi, 1);
- }
-}
-
static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
WARN_ON_ONCE(is_read_io(bio_op(bio)));
- if (type == DATA || type == NODE) {
- if (f2fs_lfs_mode(sbi) && current->plug)
- blk_finish_plug(current->plug);
-
- if (F2FS_IO_ALIGNED(sbi)) {
- f2fs_align_write_bio(sbi, bio);
- /*
- * In the NODE case, we lose next block address chain.
- * So, we need to do checkpoint in f2fs_sync_file.
- */
- if (type == NODE)
- set_sbi_flag(sbi, SBI_NEED_CP);
- }
- }
+ if (f2fs_lfs_mode(sbi) && current->plug && PAGE_TYPE_ON_MAIN(type))
+ blk_finish_plug(current->plug);
trace_f2fs_submit_write_bio(sbi->sb, type, bio);
iostat_update_submit_ctx(bio, type);
@@ -740,10 +690,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
fio->is_por ? META_POR : (__is_meta_io(fio) ?
- META_GENERIC : DATA_GENERIC_ENHANCE))) {
- f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
+ META_GENERIC : DATA_GENERIC_ENHANCE)))
return -EFSCORRUPTED;
- }
trace_f2fs_submit_page_bio(page, fio);
@@ -762,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
- __read_io_type(page) : WB_DATA_TYPE(fio->page));
+ __read_io_type(page) : WB_DATA_TYPE(fio->page, false));
if (is_read_io(bio_op(bio)))
f2fs_submit_read_bio(fio->sbi, bio, fio->type);
@@ -796,16 +744,6 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
block_t last_blkaddr,
block_t cur_blkaddr)
{
- if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) {
- unsigned int filled_blocks =
- F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size);
- unsigned int io_size = F2FS_IO_SIZE(sbi);
- unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt;
-
- /* IOs in bio is aligned and left space of vectors is not enough */
- if (!(filled_blocks % io_size) && left_vecs < io_size)
- return false;
- }
if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr))
return false;
return io_type_is_mergeable(io, fio);
@@ -948,10 +886,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
fio->encrypted_page : fio->page;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
- __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) {
- f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
+ __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
return -EFSCORRUPTED;
- }
trace_f2fs_submit_page_bio(page, fio);
@@ -973,7 +909,7 @@ alloc_new:
if (fio->io_wbc)
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
- inc_page_count(fio->sbi, WB_DATA_TYPE(page));
+ inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
*fio->last_block = fio->new_blkaddr;
*fio->bio = bio;
@@ -1007,11 +943,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
struct page *bio_page;
+ enum count_type type;
f2fs_bug_on(sbi, is_read_io(fio->op));
f2fs_down_write(&io->io_rwsem);
-
+next:
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) {
wait_for_completion_io(&io->zone_wait);
@@ -1021,7 +958,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
}
#endif
-next:
if (fio->in_list) {
spin_lock(&io->io_lock);
if (list_empty(&io->io_list)) {
@@ -1046,7 +982,8 @@ next:
/* set submitted = true as a return value */
fio->submitted = 1;
- inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+ type = WB_DATA_TYPE(bio_page, fio->compressed_page);
+ inc_page_count(sbi, type);
if (io->bio &&
(!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
@@ -1056,13 +993,6 @@ next:
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- if (F2FS_IO_ALIGNED(sbi) &&
- (fio->type == DATA || fio->type == NODE) &&
- fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
- dec_page_count(sbi, WB_DATA_TYPE(bio_page));
- fio->retry = 1;
- goto skip;
- }
io->bio = __bio_alloc(fio, BIO_MAX_VECS);
f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
bio_page->index, fio, GFP_NOIO);
@@ -1080,10 +1010,6 @@ alloc_new:
io->last_block_in_bio = fio->new_blkaddr;
trace_f2fs_submit_page_write(fio->page, fio);
-skip:
- if (fio->in_list)
- goto next;
-out:
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META &&
is_end_zone_blkaddr(sbi, fio->new_blkaddr)) {
@@ -1096,6 +1022,9 @@ out:
__submit_merged_bio(io);
}
#endif
+ if (fio->in_list)
+ goto next;
+out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
@@ -1218,7 +1147,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
+ err = inc_valid_block_count(sbi, dn->inode, &count, true);
+ if (unlikely(err))
return err;
trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
@@ -1285,8 +1215,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode),
- ERROR_INVALID_BLKADDR);
goto put_err;
}
goto got_it;
@@ -1312,8 +1240,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
dn.data_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode),
- ERROR_INVALID_BLKADDR);
goto put_err;
}
got_it:
@@ -1475,15 +1401,18 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (dn->data_blkaddr == NULL_ADDR) {
- err = inc_valid_block_count(sbi, dn->inode, &count);
+ err = inc_valid_block_count(sbi, dn->inode, &count, true);
if (unlikely(err))
return err;
}
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
old_blkaddr = dn->data_blkaddr;
- f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
- &sum, seg_type, NULL);
+ err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr,
+ &dn->data_blkaddr, &sum, seg_type, NULL);
+ if (err)
+ return err;
+
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
f2fs_invalidate_internal_cache(sbi, old_blkaddr);
@@ -1641,7 +1570,6 @@ next_block:
if (!is_hole &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto sync_out;
}
@@ -2165,8 +2093,6 @@ got_it:
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode),
- ERROR_INVALID_BLKADDR);
goto out;
}
} else {
@@ -2668,8 +2594,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
if (fio) {
if (page_private_gcing(fio->page))
return true;
- if (page_private_dummy(fio->page))
- return true;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
return true;
@@ -2706,11 +2630,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
f2fs_lookup_read_extent_cache_block(inode, page->index,
&fio->old_blkaddr)) {
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
- DATA_GENERIC_ENHANCE)) {
- f2fs_handle_error(fio->sbi,
- ERROR_INVALID_BLKADDR);
+ DATA_GENERIC_ENHANCE))
return -EFSCORRUPTED;
- }
ipu_force = true;
fio->need_lock = LOCK_DONE;
@@ -2738,7 +2659,6 @@ got_it:
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
goto out_writepage;
}
@@ -2838,7 +2758,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.encrypted_page = NULL,
.submitted = 0,
.compr_blocks = compr_blocks,
- .need_lock = LOCK_RETRY,
+ .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY,
.post_read = f2fs_post_read_required(inode) ? 1 : 0,
.io_type = io_type,
.io_wbc = wbc,
@@ -2919,6 +2839,7 @@ write:
if (err == -EAGAIN) {
err = f2fs_do_write_data_page(&fio);
if (err == -EAGAIN) {
+ f2fs_bug_on(sbi, compr_blocks);
fio.need_lock = LOCK_REQ;
err = f2fs_do_write_data_page(&fio);
}
@@ -3704,7 +3625,6 @@ repeat:
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
err = f2fs_submit_page_read(use_cow ?
@@ -3905,26 +3825,36 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int blkofs;
unsigned int blk_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int end_blk = start_blk + blkcnt - 1;
unsigned int secidx = start_blk / blk_per_sec;
- unsigned int end_sec = secidx + blkcnt / blk_per_sec;
+ unsigned int end_sec;
int ret = 0;
+ if (!blkcnt)
+ return 0;
+ end_sec = end_blk / blk_per_sec;
+
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
set_inode_flag(inode, FI_OPU_WRITE);
- for (; secidx < end_sec; secidx++) {
+ for (; secidx <= end_sec; secidx++) {
+ unsigned int blkofs_end = secidx == end_sec ?
+ end_blk % blk_per_sec : blk_per_sec - 1;
+
f2fs_down_write(&sbi->pin_sem);
- f2fs_lock_op(sbi);
- f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
- f2fs_unlock_op(sbi);
+ ret = f2fs_allocate_pinning_section(sbi);
+ if (ret) {
+ f2fs_up_write(&sbi->pin_sem);
+ break;
+ }
set_inode_flag(inode, FI_SKIP_WRITES);
- for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
+ for (blkofs = 0; blkofs <= blkofs_end; blkofs++) {
struct page *page;
unsigned int blkidx = secidx * blk_per_sec + blkofs;
@@ -4013,27 +3943,34 @@ retry:
nr_pblocks = map.m_len;
if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask ||
- nr_pblocks & sec_blks_mask) {
+ nr_pblocks & sec_blks_mask ||
+ !f2fs_valid_pinned_area(sbi, pblock)) {
+ bool last_extent = false;
+
not_aligned++;
nr_pblocks = roundup(nr_pblocks, blks_per_sec);
if (cur_lblock + nr_pblocks > sis->max)
nr_pblocks -= blks_per_sec;
+ /* this extent is the last one */
if (!nr_pblocks) {
- /* this extent is last one */
- nr_pblocks = map.m_len;
- f2fs_warn(sbi, "Swapfile: last extent is not aligned to section");
- goto next;
+ nr_pblocks = last_lblock - cur_lblock;
+ last_extent = true;
}
ret = f2fs_migrate_blocks(inode, cur_lblock,
nr_pblocks);
- if (ret)
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
goto out;
- goto retry;
+ }
+
+ if (!last_extent)
+ goto retry;
}
-next:
+
if (cur_lblock + nr_pblocks >= sis->max)
nr_pblocks = sis->max - cur_lblock;
@@ -4071,17 +4008,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
struct inode *inode = file_inode(file);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+ if (f2fs_readonly(sbi->sb))
return -EROFS;
- if (f2fs_lfs_mode(F2FS_I_SB(inode))) {
- f2fs_err(F2FS_I_SB(inode),
- "Swapfile not supported in LFS mode");
+ if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_err(sbi, "Swapfile not supported in LFS mode");
return -EINVAL;
}
@@ -4092,6 +4029,10 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
+ ret = filemap_fdatawrite(inode->i_mapping);
+ if (ret < 0)
+ return ret;
+
f2fs_precache_extents(inode);
ret = check_swap_activate(sis, file, span);
@@ -4100,7 +4041,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
stat_inc_swapfile_inode(inode);
set_inode_flag(inode, FI_PIN_FILE);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return ret;
}
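One of the data.c fixes above reworks f2fs_migrate_blocks() to walk sections inclusively instead of deriving an exclusive end section from blkcnt / blk_per_sec, which under-counted whenever the range did not start on a section boundary. A standalone sketch of the corrected arithmetic (the section size and range are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int blk_per_sec = 512;
	unsigned int start_blk = 1000, blkcnt = 600;
	unsigned int end_blk = start_blk + blkcnt - 1;	/* 1599 */
	unsigned int secidx = start_blk / blk_per_sec;	/* 1 */
	unsigned int end_sec = end_blk / blk_per_sec;	/* 3 */

	if (!blkcnt)
		return 0;

	for (; secidx <= end_sec; secidx++) {
		unsigned int blkofs_end = secidx == end_sec ?
				end_blk % blk_per_sec : blk_per_sec - 1;

		printf("section %u: block offsets [0, %u]\n",
		       secidx, blkofs_end);
	}
	return 0;
}

With these inputs the loop visits sections 1, 2 and 3; the old computation (end_sec = 1 + 600/512 = 2, exclusive loop) would have migrated section 1 only.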
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fdbf994f1271..8b0e1e71b667 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -41,7 +41,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = CAP_BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
vblocks = get_valid_blocks(sbi, segno, true);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -135,7 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->cur_ckpt_time = sbi->cprc_info.cur_time;
si->peak_ckpt_time = sbi->cprc_info.peak_time;
spin_unlock(&sbi->cprc_info.stat_lock);
- si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+ si->total_count = BLKS_TO_SEGS(sbi, (int)sbi->user_block_count);
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
si->valid_count = valid_user_blocks(sbi);
@@ -176,11 +176,10 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->io_skip_bggc = sbi->io_skip_bggc;
si->other_skip_bggc = sbi->other_skip_bggc;
- si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+ si->util_free = (int)(BLKS_TO_SEGS(sbi, free_user_blocks(sbi)))
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
- si->util_valid = (int)(written_block_count(sbi) >>
- sbi->log_blocks_per_seg)
+ si->util_valid = (int)(BLKS_TO_SEGS(sbi, written_block_count(sbi)))
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
si->util_invalid = 50 - si->util_free - si->util_valid;
@@ -208,7 +207,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
if (!blks)
continue;
- if (blks == sbi->blocks_per_seg)
+ if (blks == BLKS_PER_SEG(sbi))
si->full_seg[type]++;
else
si->dirty_seg[type]++;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 042593aed1ec..02c9355176d3 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -830,13 +830,14 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
return err;
}
-int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
+ struct f2fs_filename *fname)
{
struct page *page;
int err = 0;
f2fs_down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
+ page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -995,9 +996,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de = &d->dentry[bit_pos];
if (de->name_len == 0) {
if (found_valid_dirent || !bit_pos) {
- printk_ratelimited(
- "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
- KERN_WARNING, sbi->sb->s_id,
+ f2fs_warn_ratelimited(sbi,
+ "invalid namelen(0), ino:%u, run fsck to fix.",
le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ad8dfac73bd4..48048fa36427 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -43,7 +43,6 @@ bool sanity_check_extent_cache(struct inode *inode)
if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) ||
!f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1,
DATA_GENERIC_ENHANCE)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix",
__func__, inode->i_ino,
ei->blk, ei->fofs, ei->len);
@@ -856,10 +855,8 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
goto out;
if (__is_valid_data_blkaddr(blkaddr) &&
- !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
- f2fs_bug_on(sbi, 1);
+ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
return -EINVAL;
- }
out:
/*
* init block age with zero, this can happen when the block age extent
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4c77e8ce5c75..fced2b7652f4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -61,7 +61,9 @@ enum {
FAULT_SLAB_ALLOC,
FAULT_DQUOT_INIT,
FAULT_LOCK_OP,
- FAULT_BLKADDR,
+ FAULT_BLKADDR_VALIDITY,
+ FAULT_BLKADDR_CONSISTENCE,
+ FAULT_NO_SEGMENT,
FAULT_MAX,
};
@@ -76,6 +78,11 @@ struct f2fs_fault_info {
extern const char *f2fs_fault_name[FAULT_MAX];
#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type))
+
+/* maximum retry count for injected failure */
+#define DEFAULT_FAILURE_RETRY_COUNT 8
+#else
+#define DEFAULT_FAILURE_RETRY_COUNT 1
#endif
/*
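The fault-injection hunk above only defines DEFAULT_FAILURE_RETRY_COUNT; a hedged sketch of the bounded-retry pattern such a constant is presumably consumed by (the callers live elsewhere in this series, so this usage is an assumption, and try_get_segment() is a made-up stand-in):

#include <stdbool.h>
#include <stdio.h>

#define DEFAULT_FAILURE_RETRY_COUNT 8

/* pretend the first two attempts hit an injected fault */
static bool try_get_segment(int attempt)
{
	return attempt >= 2;
}

int main(void)
{
	int i;

	for (i = 0; i < DEFAULT_FAILURE_RETRY_COUNT; i++) {
		if (try_get_segment(i)) {
			printf("succeeded on attempt %d\n", i + 1);
			return 0;
		}
	}
	printf("gave up after %d attempts\n", DEFAULT_FAILURE_RETRY_COUNT);
	return 1;
}

Without fault injection the count collapses to 1, so the loop degenerates to a single attempt.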
@@ -143,7 +150,6 @@ struct f2fs_rwsem {
struct f2fs_mount_info {
unsigned int opt;
- int write_io_size_bits; /* Write IO size bits */
block_t root_reserved_blocks; /* root reserved blocks */
kuid_t s_resuid; /* reserved blocks for uid */
kgid_t s_resgid; /* reserved blocks for gid */
@@ -830,7 +836,7 @@ struct f2fs_inode_info {
spinlock_t i_size_lock; /* protect last_disk_size */
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
@@ -1081,7 +1087,8 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
-#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
+#define WB_DATA_TYPE(p, f) \
+ (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -1111,6 +1118,7 @@ enum count_type {
* ... Only can be used with META.
*/
#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
+#define PAGE_TYPE_ON_MAIN(type) ((type) == DATA || (type) == NODE)
enum page_type {
DATA = 0,
NODE = 1, /* should not change this */
@@ -1205,7 +1213,6 @@ struct f2fs_io_info {
unsigned int submitted:1; /* indicate IO submission */
unsigned int in_list:1; /* indicate fio is in io_list */
unsigned int is_por:1; /* indicate IO is from recovery or not */
- unsigned int retry:1; /* need to reallocate block address */
unsigned int encrypted:1; /* indicate file is encrypted */
unsigned int post_read:1; /* require post read */
enum iostat_type io_type; /* io type */
@@ -1407,18 +1414,16 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr);
* Layout A: lowest bit should be 1
* | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... |
* bit 0 PAGE_PRIVATE_NOT_POINTER
- * bit 1 PAGE_PRIVATE_DUMMY_WRITE
- * bit 2 PAGE_PRIVATE_ONGOING_MIGRATION
- * bit 3 PAGE_PRIVATE_INLINE_INODE
- * bit 4 PAGE_PRIVATE_REF_RESOURCE
- * bit 5- f2fs private data
+ * bit 1 PAGE_PRIVATE_ONGOING_MIGRATION
+ * bit 2 PAGE_PRIVATE_INLINE_INODE
+ * bit 3 PAGE_PRIVATE_REF_RESOURCE
+ * bit 4- f2fs private data
*
* Layout B: lowest bit should be 0
* page.private is a wrapped pointer.
*/
enum {
PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */
- PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */
PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */
PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */
PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */
@@ -1565,7 +1570,6 @@ struct f2fs_sb_info {
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
/* keep migration IO order for LFS mode */
struct f2fs_rwsem io_order_lock;
- mempool_t *write_io_dummy; /* Dummy pages */
pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */
int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */
@@ -1811,6 +1815,37 @@ struct f2fs_sb_info {
#endif
};
+/* Definitions to access f2fs_sb_info */
+#define SEGS_TO_BLKS(sbi, segs) \
+ ((segs) << (sbi)->log_blocks_per_seg)
+#define BLKS_TO_SEGS(sbi, blks) \
+ ((blks) >> (sbi)->log_blocks_per_seg)
+
+#define BLKS_PER_SEG(sbi) ((sbi)->blocks_per_seg)
+#define BLKS_PER_SEC(sbi) (SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec))
+#define SEGS_PER_SEC(sbi) ((sbi)->segs_per_sec)
+
+__printf(3, 4)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...);
+
+#define f2fs_err(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_notice(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__)
+#define f2fs_info(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__)
+#define f2fs_debug(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__)
+
+#define f2fs_err_ratelimited(sbi, fmt, ...) \
+ f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn_ratelimited(sbi, fmt, ...) \
+ f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_info_ratelimited(sbi, fmt, ...) \
+ f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__)
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
#define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \
__builtin_return_address(0))
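A quick numeric check of the new segment/block conversion helpers introduced above (a sketch: LOG_BLOCKS_PER_SEG = 9, i.e. 512 4KB blocks per 2MB segment, is the common f2fs geometry, but the value is an assumption here):

#include <stdio.h>

#define LOG_BLOCKS_PER_SEG	9	/* assumed default geometry */
#define SEGS_TO_BLKS(segs)	((segs) << LOG_BLOCKS_PER_SEG)
#define BLKS_TO_SEGS(blks)	((blks) >> LOG_BLOCKS_PER_SEG)

int main(void)
{
	printf("8 segs    -> %u blocks\n", SEGS_TO_BLKS(8u));	/* 4096 */
	printf("4096 blks -> %u segs\n", BLKS_TO_SEGS(4096u));	/* 8 */
	return 0;
}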
@@ -1828,9 +1863,8 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type,
atomic_inc(&ffi->inject_ops);
if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
atomic_set(&ffi->inject_ops, 0);
- printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n",
- KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type],
- func, parent_func);
+ f2fs_info_ratelimited(sbi, "inject %s in %s of %pS",
+ f2fs_fault_name[type], func, parent_func);
return true;
}
return false;
@@ -2250,9 +2284,30 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
return false;
}
+static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi,
+ struct inode *inode, bool cap)
+{
+ block_t avail_user_block_count;
+
+ avail_user_block_count = sbi->user_block_count -
+ sbi->current_reserved_blocks;
+
+ if (!__allow_reserved_blocks(sbi, inode, cap))
+ avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (avail_user_block_count > sbi->unusable_block_count)
+ avail_user_block_count -= sbi->unusable_block_count;
+ else
+ avail_user_block_count = 0;
+ }
+
+ return avail_user_block_count;
+}
+
static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
- struct inode *inode, blkcnt_t *count)
+ struct inode *inode, blkcnt_t *count, bool partial)
{
blkcnt_t diff = 0, release = 0;
block_t avail_user_block_count;
@@ -2275,23 +2330,14 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
sbi->total_valid_block_count += (block_t)(*count);
- avail_user_block_count = sbi->user_block_count -
- sbi->current_reserved_blocks;
-
- if (!__allow_reserved_blocks(sbi, inode, true))
- avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
-
- if (F2FS_IO_ALIGNED(sbi))
- avail_user_block_count -= sbi->blocks_per_seg *
- SM_I(sbi)->additional_reserved_segments;
+ avail_user_block_count = get_available_block_count(sbi, inode, true);
- if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
- if (avail_user_block_count > sbi->unusable_block_count)
- avail_user_block_count -= sbi->unusable_block_count;
- else
- avail_user_block_count = 0;
- }
if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+ if (!partial) {
+ spin_unlock(&sbi->stat_lock);
+ goto enospc;
+ }
+
diff = sbi->total_valid_block_count - avail_user_block_count;
if (diff > *count)
diff = *count;
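The hunk above gives inc_valid_block_count() a partial mode. A standalone sketch of the two behaviors (quota release and locking are omitted; names and numbers are illustrative): with partial == false an over-commit fails outright, with partial == true the request is trimmed to what fits:

#include <stdio.h>

#define ENOSPC 28

static int inc_valid_block_count(unsigned int *total_valid,
				 unsigned int avail,
				 unsigned int *count, int partial)
{
	*total_valid += *count;
	if (*total_valid > avail) {
		unsigned int diff = *total_valid - avail;

		if (!partial) {
			*total_valid -= *count;
			return -ENOSPC;
		}
		if (diff > *count)
			diff = *count;
		*count -= diff;
		*total_valid -= diff;
		if (!*count)
			return -ENOSPC;
	}
	return 0;
}

int main(void)
{
	unsigned int total_valid = 90, count = 20;

	/* only 10 of the requested 20 blocks fit under avail = 100 */
	if (!inc_valid_block_count(&total_valid, 100, &count, 1))
		printf("trimmed to %u blocks, %u valid now\n",
		       count, total_valid);
	return 0;
}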
@@ -2319,20 +2365,6 @@ release_quota:
return -ENOSPC;
}
-__printf(2, 3)
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...);
-
-#define f2fs_err(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__)
-#define f2fs_warn(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__)
-#define f2fs_notice(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__)
-#define f2fs_info(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__)
-#define f2fs_debug(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__)
-
#define PAGE_PRIVATE_GET_FUNC(name, flagname) \
static inline bool page_private_##name(struct page *page) \
{ \
@@ -2361,17 +2393,14 @@ static inline void clear_page_private_##name(struct page *page) \
PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER);
PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE);
PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE);
PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE);
static inline unsigned long get_page_private_data(struct page *page)
{
@@ -2505,11 +2534,8 @@ static inline int get_dirty_pages(struct inode *inode)
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
- unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
- unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >>
- sbi->log_blocks_per_seg;
-
- return segs / sbi->segs_per_sec;
+ return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1,
+ BLKS_PER_SEC(sbi));
}
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
@@ -2573,7 +2599,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
if (sbi->cur_cp_pack == 2)
- start_addr += sbi->blocks_per_seg;
+ start_addr += BLKS_PER_SEG(sbi);
return start_addr;
}
@@ -2582,7 +2608,7 @@ static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
if (sbi->cur_cp_pack == 1)
- start_addr += sbi->blocks_per_seg;
+ start_addr += BLKS_PER_SEG(sbi);
return start_addr;
}
@@ -2601,7 +2627,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
block_t valid_block_count;
- unsigned int valid_node_count, user_block_count;
+ unsigned int valid_node_count;
+ unsigned int avail_user_block_count;
int err;
if (is_inode) {
@@ -2621,21 +2648,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
- valid_block_count = sbi->total_valid_block_count +
- sbi->current_reserved_blocks + 1;
-
- if (!__allow_reserved_blocks(sbi, inode, false))
- valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+ valid_block_count = sbi->total_valid_block_count + 1;
+ avail_user_block_count = get_available_block_count(sbi, inode, false);
- if (F2FS_IO_ALIGNED(sbi))
- valid_block_count += sbi->blocks_per_seg *
- SM_I(sbi)->additional_reserved_segments;
-
- user_block_count = sbi->user_block_count;
- if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
- user_block_count -= sbi->unusable_block_count;
-
- if (unlikely(valid_block_count > user_block_count)) {
+ if (unlikely(valid_block_count > avail_user_block_count)) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
@@ -3022,6 +3038,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_INLINE_DOTS:
case FI_PIN_FILE:
case FI_COMPRESS_RELEASED:
+ case FI_ATOMIC_COMMITTED:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -3445,7 +3462,7 @@ static inline __le32 *get_dnode_addr(struct inode *inode,
sizeof((f2fs_inode)->field)) \
<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
-#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
+#define __is_large_section(sbi) (SEGS_PER_SEC(sbi) > 1)
#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META)
@@ -3454,11 +3471,9 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) {
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type))
f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.",
blkaddr, type);
- f2fs_bug_on(sbi, 1);
- }
}
static inline bool __is_valid_data_blkaddr(block_t blkaddr)
@@ -3560,7 +3575,8 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode);
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
struct inode *dir, struct inode *inode);
-int f2fs_do_tmpfile(struct inode *inode, struct inode *dir);
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
+ struct f2fs_filename *fname);
bool f2fs_empty_dir(struct inode *dir);
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
@@ -3675,15 +3691,14 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno);
-void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
+int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi);
void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi);
-void f2fs_get_new_segment(struct f2fs_sb_info *sbi,
- unsigned int *newseg, bool new_sec, int dir);
-void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
-void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
+int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi);
+int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
@@ -3704,7 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
unsigned char version, bool recover_curseg,
bool recover_newaddr);
-void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio);
@@ -3754,6 +3769,8 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type);
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
@@ -3794,6 +3811,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
+bool f2fs_is_cp_guaranteed(struct page *page);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
@@ -3857,6 +3875,9 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control);
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
+int f2fs_gc_range(struct f2fs_sb_info *sbi,
+ unsigned int start_seg, unsigned int end_seg,
+ bool dry_run, unsigned int dry_run_sections);
int f2fs_resize_fs(struct file *filp, __u64 block_count);
int __init f2fs_create_garbage_collection_cache(void);
void f2fs_destroy_garbage_collection_cache(void);
@@ -4277,7 +4298,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
bool in_task);
void f2fs_put_page_dic(struct page *page, bool in_task);
-unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
+ unsigned int ofs_in_node);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@ -4334,7 +4356,8 @@ static inline void f2fs_put_page_dic(struct page *page, bool in_task)
{
WARN_ON_ONCE(1);
}
-static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; }
+static inline unsigned int f2fs_cluster_blocks_are_contiguous(
+ struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; }
static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; }
static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
@@ -4391,15 +4414,24 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
- if (!f2fs_compressed_file(inode))
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
+
+ if (!f2fs_compressed_file(inode)) {
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return true;
- if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
+ }
+ if (f2fs_is_mmap_file(inode) ||
+ (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) {
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return false;
+ }
fi->i_flags &= ~F2FS_COMPR_FL;
stat_dec_compr_inode(inode);
clear_inode_flag(inode, FI_COMPRESSED_FILE);
f2fs_mark_inode_dirty_sync(inode, true);
+
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return true;
}
@@ -4502,6 +4534,17 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
+static inline bool f2fs_valid_pinned_area(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ int devi = f2fs_target_device_index(sbi, blkaddr);
+
+ return !bdev_is_zoned(FDEV(devi).bdev);
+ }
+ return true;
+}
+
static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW;
@@ -4603,10 +4646,36 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
}
+static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi,
+ block_t blkaddr, unsigned int cnt)
+{
+ bool need_submit = false;
+ int i = 0;
+
+ do {
+ struct page *page;
+
+ page = find_get_page(META_MAPPING(sbi), blkaddr + i);
+ if (page) {
+ if (PageWriteback(page))
+ need_submit = true;
+ f2fs_put_page(page, 0);
+ }
+ } while (++i < cnt && !need_submit);
+
+ if (need_submit)
+ f2fs_submit_merged_write_cond(sbi, sbi->meta_inode,
+ NULL, 0, DATA);
+
+ truncate_inode_pages_range(META_MAPPING(sbi),
+ F2FS_BLK_TO_BYTES((loff_t)blkaddr),
+ F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1)));
+}
+
static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
block_t blkaddr)
{
- invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr);
+ f2fs_truncate_meta_inode_pages(sbi, blkaddr, 1);
f2fs_invalidate_compress_page(sbi, blkaddr);
}
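f2fs_valid_pinned_area() above encodes the rule that pinned (in-place written) blocks must land on a conventional device when the filesystem spans zoned storage, since sequential zones forbid overwrites. A standalone sketch of that device-range check (the device table and lookup are illustrative stand-ins for f2fs_target_device_index() and bdev_is_zoned()):

#include <stdbool.h>
#include <stdio.h>

struct dev_range {
	unsigned int start_blk;
	unsigned int end_blk;
	bool is_zoned;
};

static bool valid_pinned_area(const struct dev_range *devs, int ndevs,
			      unsigned int blkaddr)
{
	int i;

	for (i = 0; i < ndevs; i++)
		if (blkaddr >= devs[i].start_blk &&
		    blkaddr <= devs[i].end_blk)
			return !devs[i].is_zoned;
	return false;
}

int main(void)
{
	struct dev_range devs[] = {
		{ 0,      99999,  false },	/* conventional */
		{ 100000, 999999, true },	/* zoned */
	};

	printf("blk 5000   pinnable: %d\n", valid_pinned_area(devs, 2, 5000));
	printf("blk 200000 pinnable: %d\n", valid_pinned_area(devs, 2, 200000));
	return 0;
}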
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b58ab1157b7e..1761ad125f97 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -39,6 +39,7 @@
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_flags_t flags = vmf->vma->vm_flags;
vm_fault_t ret;
ret = filemap_fault(vmf);
@@ -46,7 +47,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_MAPPED_READ_IO, F2FS_BLKSIZE);
- trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret);
+ trace_f2fs_filemap_fault(inode, vmf->pgoff, flags, ret);
return ret;
}
@@ -394,9 +395,20 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
return f2fs_do_sync_file(file, start, end, datasync, false);
}
-static bool __found_offset(struct address_space *mapping, block_t blkaddr,
- pgoff_t index, int whence)
+static bool __found_offset(struct address_space *mapping,
+ struct dnode_of_data *dn, pgoff_t index, int whence)
{
+ block_t blkaddr = f2fs_data_blkaddr(dn);
+ struct inode *inode = mapping->host;
+ bool compressed_cluster = false;
+
+ if (f2fs_compressed_file(inode)) {
+ block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size));
+
+ compressed_cluster = first_blkaddr == COMPRESS_ADDR;
+ }
+
switch (whence) {
case SEEK_DATA:
if (__is_valid_data_blkaddr(blkaddr))
@@ -404,8 +416,12 @@ static bool __found_offset(struct address_space *mapping, block_t blkaddr,
if (blkaddr == NEW_ADDR &&
xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY))
return true;
+ if (compressed_cluster)
+ return true;
break;
case SEEK_HOLE:
+ if (compressed_cluster)
+ return false;
if (blkaddr == NULL_ADDR)
return true;
break;
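The two hunks above make llseek treat every offset inside a compressed cluster as data. A simplified standalone sketch of the rule (the NEW_ADDR-plus-dirty-page case is omitted and the constants are illustrative, so this is an approximation, not the kernel predicate):

#include <stdbool.h>
#include <stdio.h>

#define SEEK_DATA_ 3
#define SEEK_HOLE_ 4
#define NULL_ADDR  0u

static bool found_offset(unsigned int blkaddr, bool compressed_cluster,
			 int whence)
{
	switch (whence) {
	case SEEK_DATA_:
		return compressed_cluster || blkaddr != NULL_ADDR;
	case SEEK_HOLE_:
		return !compressed_cluster && blkaddr == NULL_ADDR;
	}
	return false;
}

int main(void)
{
	/* an unwritten slot inside a compressed cluster is still data */
	printf("SEEK_DATA -> %d\n", found_offset(NULL_ADDR, true, SEEK_DATA_));
	printf("SEEK_HOLE -> %d\n", found_offset(NULL_ADDR, true, SEEK_HOLE_));
	return 0;
}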
@@ -474,7 +490,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto fail;
}
- if (__found_offset(file->f_mapping, blkaddr,
+ if (__found_offset(file->f_mapping, &dn,
pgofs, whence)) {
f2fs_put_dnode(&dn);
goto found;
@@ -590,8 +606,10 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
f2fs_set_data_blkaddr(dn, NULL_ADDR);
if (__is_valid_data_blkaddr(blkaddr)) {
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE))
+ if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE))
+ continue;
+ if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE))
continue;
if (compressed_cluster)
valid_blocks++;
@@ -818,8 +836,6 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
*/
if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
return true;
- if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
- return true;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
return true;
@@ -1192,7 +1208,6 @@ next_dnode:
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
}
@@ -1478,7 +1493,6 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
break;
}
@@ -1662,10 +1676,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
}
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ if (ret)
+ return ret;
/* write out all moved pages, if possible */
filemap_invalidate_lock(mapping);
- filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
+ ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
filemap_invalidate_unlock(mapping);
@@ -1731,9 +1747,11 @@ next_alloc:
f2fs_down_write(&sbi->pin_sem);
- f2fs_lock_op(sbi);
- f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
- f2fs_unlock_op(sbi);
+ err = f2fs_allocate_pinning_section(sbi);
+ if (err) {
+ f2fs_up_write(&sbi->pin_sem);
+ goto out_err;
+ }
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO);
@@ -2066,7 +2084,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
inode_lock(inode);
- if (!f2fs_disable_compressed_file(inode)) {
+ if (!f2fs_disable_compressed_file(inode) ||
+ f2fs_is_pinned_file(inode)) {
ret = -EINVAL;
goto out;
}
@@ -2243,8 +2262,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
- if (ret)
+ if (ret) {
+ if (ret == -EIO)
+ ret = 0;
goto out;
+ }
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NOSYNC:
@@ -2260,6 +2282,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
set_sbi_flag(sbi, SBI_IS_DIRTY);
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
+ if (ret == -EIO)
+ ret = 0;
goto out;
default:
ret = -EINVAL;
@@ -2578,7 +2602,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
.m_may_create = false };
struct extent_info ei = {};
pgoff_t pg_start, pg_end, next_pgofs;
- unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
block_t blk_end = 0;
bool fragmented = false;
@@ -2687,7 +2710,8 @@ do_map:
set_inode_flag(inode, FI_SKIP_WRITES);
idx = map.m_lblk;
- while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+ while (idx < map.m_lblk + map.m_len &&
+ cnt < BLKS_PER_SEG(sbi)) {
struct page *page;
page = f2fs_get_lock_data_page(inode, idx, true);
@@ -2707,7 +2731,7 @@ do_map:
map.m_lblk = idx;
check:
- if (map.m_lblk < pg_end && cnt < blk_per_seg)
+ if (map.m_lblk < pg_end && cnt < BLKS_PER_SEG(sbi))
goto do_map;
clear_inode_flag(inode, FI_SKIP_WRITES);
@@ -2976,8 +3000,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num ||
__is_large_section(sbi)) {
- f2fs_warn(sbi, "Can't flush %u in %d for segs_per_sec %u != 1",
- range.dev_num, sbi->s_ndevs, sbi->segs_per_sec);
+ f2fs_warn(sbi, "Can't flush %u in %d for SEGS_PER_SEC %u != 1",
+ range.dev_num, sbi->s_ndevs, SEGS_PER_SEC(sbi));
return -EINVAL;
}
@@ -3183,6 +3207,7 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
__u32 pin;
int ret = 0;
@@ -3192,7 +3217,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+ if (f2fs_readonly(sbi->sb))
return -EROFS;
ret = mnt_want_write_file(filp);
@@ -3205,9 +3230,18 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
clear_inode_flag(inode, FI_PIN_FILE);
f2fs_i_gc_failures_write(inode, 0);
goto done;
+ } else if (f2fs_is_pinned_file(inode)) {
+ goto done;
}
- if (f2fs_should_update_outplace(inode, NULL)) {
+ if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ /* Let's allow file pinning on a zoned device. */
+ if (!f2fs_sb_has_blkzoned(sbi) &&
+ f2fs_should_update_outplace(inode, NULL)) {
ret = -EINVAL;
goto out;
}
@@ -3229,7 +3263,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
set_inode_flag(inode, FI_PIN_FILE);
ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
done:
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
out:
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -3438,10 +3472,8 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE))) {
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ DATA_GENERIC_ENHANCE)))
return -EFSCORRUPTED;
- }
}
while (count) {
@@ -3588,10 +3620,10 @@ out:
return ret;
}
-static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
+static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
+ unsigned int *reserved_blocks)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- unsigned int reserved_blocks = 0;
int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
block_t blkaddr;
int i;
@@ -3603,10 +3635,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE))) {
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ DATA_GENERIC_ENHANCE)))
return -EFSCORRUPTED;
- }
}
while (count) {
@@ -3614,40 +3644,53 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
blkcnt_t reserved;
int ret;
- for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
- blkaddr = f2fs_data_blkaddr(dn);
+ for (i = 0; i < cluster_size; i++) {
+ blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
if (i == 0) {
- if (blkaddr == COMPRESS_ADDR)
- continue;
- dn->ofs_in_node += cluster_size;
- goto next;
+ if (blkaddr != COMPRESS_ADDR) {
+ dn->ofs_in_node += cluster_size;
+ goto next;
+ }
+ continue;
}
- if (__is_valid_data_blkaddr(blkaddr)) {
+ /*
+ * compressed cluster was not released because releasing
+ * it failed in release_compress_blocks(), so NEW_ADDR
+ * is a possible case.
+ */
+ if (blkaddr == NEW_ADDR ||
+ __is_valid_data_blkaddr(blkaddr)) {
compr_blocks++;
continue;
}
-
- f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
reserved = cluster_size - compr_blocks;
- ret = inc_valid_block_count(sbi, dn->inode, &reserved);
- if (ret)
+
+ /* for the case that all blocks in the cluster were reserved */
+ if (reserved == 1)
+ goto next;
+
+ ret = inc_valid_block_count(sbi, dn->inode, &reserved, false);
+ if (unlikely(ret))
return ret;
- if (reserved != cluster_size - compr_blocks)
- return -ENOSPC;
+ for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
+ if (f2fs_data_blkaddr(dn) == NULL_ADDR)
+ f2fs_set_data_blkaddr(dn, NEW_ADDR);
+ }
f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
- reserved_blocks += reserved;
+ *reserved_blocks += reserved;
next:
count -= cluster_size;
}
- return reserved_blocks;
+ return 0;
}
static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
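A small worked example of the per-cluster accounting in the rewritten reserve_compress_blocks() above: slot 0 of a released cluster keeps COMPRESS_ADDR and compr_blocks counts the surviving NEW_ADDR/valid slots, so cluster_size - compr_blocks reservations are needed, and a result of 1 means every data slot is still present (the numbers are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int cluster_size = 4;
	unsigned int compr_blocks = 2;	/* NEW_ADDR or valid slots kept */
	unsigned int reserved = cluster_size - compr_blocks;

	if (reserved == 1)
		printf("cluster fully populated, skip\n");
	else
		printf("reserve %u blocks for this cluster\n", reserved);
	return 0;
}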
@@ -3671,9 +3714,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (ret)
return ret;
- if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
- goto out;
-
f2fs_balance_fs(sbi, true);
inode_lock(inode);
@@ -3683,6 +3723,9 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
goto unlock_inode;
}
+ if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
+ goto unlock_inode;
+
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
@@ -3708,7 +3751,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
count = round_up(count, F2FS_I(inode)->i_cluster_size);
- ret = reserve_compress_blocks(&dn, count);
+ ret = reserve_compress_blocks(&dn, count, &reserved_blocks);
f2fs_put_dnode(&dn);
@@ -3716,23 +3759,21 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
break;
page_idx += count;
- reserved_blocks += ret;
}
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (ret >= 0) {
+ if (!ret) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
unlock_inode:
inode_unlock(inode);
-out:
mnt_drop_write_file(filp);
- if (ret >= 0) {
+ if (!ret) {
ret = put_user(reserved_blocks, (u64 __user *)arg);
} else if (reserved_blocks &&
atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
@@ -3877,8 +3918,6 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
f2fs_put_dnode(&dn);
- f2fs_handle_error(sbi,
- ERROR_INVALID_BLKADDR);
goto out;
}
@@ -3981,16 +4020,20 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
sizeof(option)))
return -EFAULT;
- if (!f2fs_compressed_file(inode) ||
- option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
- option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
- option.algorithm >= COMPRESS_MAX)
+ if (option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
+ option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
+ option.algorithm >= COMPRESS_MAX)
return -EINVAL;
file_start_write(filp);
inode_lock(inode);
f2fs_down_write(&F2FS_I(inode)->i_sem);
+ if (!f2fs_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) {
ret = -EBUSY;
goto out;
@@ -4066,7 +4109,6 @@ static int f2fs_ioc_decompress_file(struct file *filp)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
pgoff_t page_idx = 0, last_idx;
- unsigned int blk_per_seg = sbi->blocks_per_seg;
int cluster_size = fi->i_cluster_size;
int count, ret;
@@ -4110,7 +4152,7 @@ static int f2fs_ioc_decompress_file(struct file *filp)
if (ret < 0)
break;
- if (get_dirty_pages(inode) >= blk_per_seg) {
+ if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) {
ret = filemap_fdatawrite(inode->i_mapping);
if (ret < 0)
break;
@@ -4145,7 +4187,6 @@ static int f2fs_ioc_compress_file(struct file *filp)
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t page_idx = 0, last_idx;
- unsigned int blk_per_seg = sbi->blocks_per_seg;
int cluster_size = F2FS_I(inode)->i_cluster_size;
int count, ret;
@@ -4188,7 +4229,7 @@ static int f2fs_ioc_compress_file(struct file *filp)
if (ret < 0)
break;
- if (get_dirty_pages(inode) >= blk_per_seg) {
+ if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) {
ret = filemap_fdatawrite(inode->i_mapping);
if (ret < 0)
break;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a079eebfb080..8852814dab7f 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -259,7 +259,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(sbi, gc_type);
- p->ofs_unit = sbi->segs_per_sec;
+ p->ofs_unit = SEGS_PER_SEC(sbi);
if (__is_large_section(sbi)) {
p->dirty_bitmap = dirty_i->dirty_secmap;
p->max_search = count_bits(p->dirty_bitmap,
@@ -280,11 +280,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- /* let's select beginning hot/small space first in no_heap mode*/
+ /* let's select beginning hot/small space first. */
if (f2fs_need_rand_seg(sbi))
- p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
- else if (test_opt(sbi, NOHEAP) &&
- (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
+ p->offset = get_random_u32_below(MAIN_SECS(sbi) *
+ SEGS_PER_SEC(sbi));
+ else if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
p->offset = 0;
else
p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
@@ -295,13 +295,13 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
else if (p->alloc_mode == AT_SSR)
return UINT_MAX;
/* LFS */
if (p->gc_mode == GC_GREEDY)
- return 2 * sbi->blocks_per_seg * p->ofs_unit;
+ return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit);
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else if (p->gc_mode == GC_AT)
@@ -348,7 +348,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
mtime = div_u64(mtime, usable_segs_per_sec);
vblocks = div_u64(vblocks, usable_segs_per_sec);
- u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+ u = BLKS_TO_SEGS(sbi, vblocks * 100);
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
@@ -496,9 +496,9 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
return;
}
- for (i = 0; i < sbi->segs_per_sec; i++)
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
- mtime = div_u64(mtime, sbi->segs_per_sec);
+ mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
@@ -599,7 +599,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
unsigned long long age;
unsigned long long max_mtime = sit_i->dirty_max_mtime;
unsigned long long min_mtime = sit_i->dirty_min_mtime;
- unsigned int seg_blocks = sbi->blocks_per_seg;
unsigned int vblocks;
unsigned int dirty_threshold = max(am->max_candidate_count,
am->candidate_ratio *
@@ -629,7 +628,7 @@ next_node:
f2fs_bug_on(sbi, !vblocks);
/* rare case */
- if (vblocks == seg_blocks)
+ if (vblocks == BLKS_PER_SEG(sbi))
goto skip_node;
iter++;
@@ -755,7 +754,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
int ret = 0;
mutex_lock(&dirty_i->seglist_lock);
- last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
+ last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
p.alloc_mode = alloc_mode;
p.age = age;
@@ -896,7 +895,7 @@ next:
else
sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
sm->last_victim[p.gc_mode] %=
- (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ (MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
break;
}
}
@@ -1184,7 +1183,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
.op_flags = 0,
.encrypted_page = NULL,
.in_list = 0,
- .retry = 0,
};
int err;
@@ -1197,7 +1195,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
goto got_it;
@@ -1216,7 +1213,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE))) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
got_it:
@@ -1273,7 +1269,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
.op_flags = 0,
.encrypted_page = NULL,
.in_list = 0,
- .retry = 0,
};
struct dnode_of_data dn;
struct f2fs_summary sum;
@@ -1364,8 +1359,13 @@ static int move_data_block(struct inode *inode, block_t bidx,
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* allocate block address */
- f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+ err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
&sum, type, NULL);
+ if (err) {
+ f2fs_put_page(mpage, 1);
+ /* filesystem should shut down, no need to recover the block */
+ goto up_out;
+ }
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -1393,18 +1393,12 @@ static int move_data_block(struct inode *inode, block_t bidx,
fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
f2fs_submit_page_write(&fio);
- if (fio.retry) {
- err = -EAGAIN;
- if (PageWriteback(fio.encrypted_page))
- end_page_writeback(fio.encrypted_page);
- goto put_page_out;
- }
f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE);
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
-put_page_out:
+
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
if (err)
@@ -1678,7 +1672,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct f2fs_summary_block *sum;
struct blk_plug plug;
unsigned int segno = start_segno;
- unsigned int end_segno = start_segno + sbi->segs_per_sec;
+ unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi);
int seg_freed = 0, migrated = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
@@ -1686,7 +1680,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
int submitted = 0;
if (__is_large_section(sbi))
- end_segno = rounddown(end_segno, sbi->segs_per_sec);
+ end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
/*
* zone-capacity can be less than zone-size in zoned devices,
@@ -1694,7 +1688,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
* calculate the end segno in the zone which can be garbage collected
*/
if (f2fs_sb_has_blkzoned(sbi))
- end_segno -= sbi->segs_per_sec -
+ end_segno -= SEGS_PER_SEC(sbi) -
f2fs_usable_segs_in_sec(sbi, segno);
sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
@@ -1983,10 +1977,43 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
init_atgc_management(sbi);
}
+int f2fs_gc_range(struct f2fs_sb_info *sbi,
+ unsigned int start_seg, unsigned int end_seg,
+ bool dry_run, unsigned int dry_run_sections)
+{
+ unsigned int segno;
+ unsigned int gc_secs = dry_run_sections;
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) {
+ struct gc_inode_list gc_list = {
+ .ilist = LIST_HEAD_INIT(gc_list.ilist),
+ .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
+ };
+
+ do_garbage_collect(sbi, segno, &gc_list, FG_GC,
+ dry_run_sections == 0);
+ put_gc_inode(&gc_list);
+
+ if (!dry_run && get_valid_blocks(sbi, segno, true))
+ return -EAGAIN;
+ if (dry_run && dry_run_sections &&
+ !get_valid_blocks(sbi, segno, true) && --gc_secs == 0)
+ break;
+
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ }
+
+ return 0;
+}
+
static int free_segment_range(struct f2fs_sb_info *sbi,
- unsigned int secs, bool gc_only)
+ unsigned int secs, bool dry_run)
{
- unsigned int segno, next_inuse, start, end;
+ unsigned int next_inuse, start, end;
struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
int gc_mode, gc_type;
int err = 0;
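A standalone sketch of the control flow of the f2fs_gc_range() helper added above (signal handling and the real do_garbage_collect() are stubbed out; the section geometry and valid-block counts are illustrative): a real run fails on the first section left with valid blocks, while a dry run just probes whether the requested number of sections can be emptied:

#include <stdbool.h>
#include <stdio.h>

#define SEGS_PER_SEC 2

static unsigned int valid_blocks[8] = { 0, 0, 0, 0, 5, 0, 0, 0 };

static int gc_range(unsigned int start, unsigned int end,
		    bool dry_run, unsigned int dry_run_sections)
{
	unsigned int segno, gc_secs = dry_run_sections;

	for (segno = start; segno <= end; segno += SEGS_PER_SEC) {
		/* do_garbage_collect() would run on this section here */
		if (!dry_run && valid_blocks[segno])
			return -1;	/* -EAGAIN in the kernel */
		if (dry_run && dry_run_sections &&
		    !valid_blocks[segno] && --gc_secs == 0)
			break;
	}
	return 0;
}

int main(void)
{
	printf("dry run, probe 2 sections: %d\n", gc_range(0, 7, true, 2));
	printf("real run over the range:  %d\n", gc_range(0, 7, false, 0));
	return 0;
}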
@@ -1994,7 +2021,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
/* Force block allocation for GC */
MAIN_SECS(sbi) -= secs;
- start = MAIN_SECS(sbi) * sbi->segs_per_sec;
+ start = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
end = MAIN_SEGS(sbi) - 1;
mutex_lock(&DIRTY_I(sbi)->seglist_lock);
@@ -2008,29 +2035,15 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
/* Move out cursegs from the target range */
- for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
- f2fs_allocate_segment_for_resize(sbi, type, start, end);
-
- /* do GC to move out valid blocks in the range */
- for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
- struct gc_inode_list gc_list = {
- .ilist = LIST_HEAD_INIT(gc_list.ilist),
- .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
- };
-
- do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
- put_gc_inode(&gc_list);
-
- if (!gc_only && get_valid_blocks(sbi, segno, true)) {
- err = -EAGAIN;
- goto out;
- }
- if (fatal_signal_pending(current)) {
- err = -ERESTARTSYS;
+ for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) {
+ err = f2fs_allocate_segment_for_resize(sbi, type, start, end);
+ if (err)
goto out;
- }
}
- if (gc_only)
+
+ /* do GC to move out valid blocks in the range */
+ err = f2fs_gc_range(sbi, start, end, dry_run, 0);
+ if (err || dry_run)
goto out;
stat_inc_cp_call_count(sbi, TOTAL_CALL);
@@ -2056,7 +2069,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
int segment_count;
int segment_count_main;
long long block_count;
- int segs = secs * sbi->segs_per_sec;
+ int segs = secs * SEGS_PER_SEC(sbi);
f2fs_down_write(&sbi->sb_lock);
@@ -2069,7 +2082,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->segment_count = cpu_to_le32(segment_count + segs);
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
raw_sb->block_count = cpu_to_le64(block_count +
- (long long)segs * sbi->blocks_per_seg);
+ (long long)SEGS_TO_BLKS(sbi, segs));
if (f2fs_is_multi_device(sbi)) {
int last_dev = sbi->s_ndevs - 1;
int dev_segs =
@@ -2084,8 +2097,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
{
- int segs = secs * sbi->segs_per_sec;
- long long blks = (long long)segs * sbi->blocks_per_seg;
+ int segs = secs * SEGS_PER_SEC(sbi);
+ long long blks = SEGS_TO_BLKS(sbi, segs);
long long user_block_count =
le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
@@ -2127,7 +2140,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count)
int last_dev = sbi->s_ndevs - 1;
__u64 last_segs = FDEV(last_dev).total_segments;
- if (block_count + last_segs * sbi->blocks_per_seg <=
+ if (block_count + SEGS_TO_BLKS(sbi, last_segs) <=
old_block_count)
return -EINVAL;
}
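The gc.c conversions above (and the rest of this series) route all segment/section/block arithmetic through a handful of accessors instead of reading sbi fields directly. For reference, a sketch of the assumed macro shapes — the real definitions live in the f2fs headers, so treat the bodies here as illustrative:

	/* Illustrative only; names match the diff, bodies are assumptions. */
	#define BLKS_PER_SEG(sbi)	((sbi)->blocks_per_seg)
	#define SEGS_PER_SEC(sbi)	((sbi)->segs_per_sec)
	#define SEGS_TO_BLKS(sbi, segs)	((segs) << (sbi)->log_blocks_per_seg)
	#define BLKS_TO_SEGS(sbi, blks)	((blks) >> (sbi)->log_blocks_per_seg)
	#define BLKS_PER_SEC(sbi)	(SEGS_TO_BLKS(sbi, SEGS_PER_SEC(sbi)))

The shift form assumes blocks-per-segment is a power of two (512 blocks, i.e. 2MiB segments with 4KiB blocks, in the common layout), which is why the helpers can replace both the multiplications and the << log_blocks_per_seg shifts in the old code.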
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 28a00942802c..9c0d06c4d19a 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -96,7 +96,7 @@ static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi)
if (f2fs_sb_has_blkzoned(sbi))
return free_segs_blk_count_zoned(sbi);
- return free_segments(sbi) << sbi->log_blocks_per_seg;
+ return SEGS_TO_BLKS(sbi, free_segments(sbi));
}
static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
@@ -104,7 +104,7 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
block_t free_blks, ovp_blks;
free_blks = free_segs_blk_count(sbi);
- ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg;
+ ovp_blks = SEGS_TO_BLKS(sbi, overprovision_segments(sbi));
if (free_blks < ovp_blks)
return 0;
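After the gc.h change, free_user_blocks() reads as "free blocks minus overprovisioned blocks, clamped at zero". A minimal self-contained sketch of that arithmetic (function name and values are assumptions for illustration):

	/* 100 free segments, 30 overprovision segments, 512-block segments
	 * -> (100 - 30) * 512 user-visible free blocks. */
	static inline unsigned int example_free_user_blocks(unsigned int free_segs,
			unsigned int ovp_segs, unsigned int blks_per_seg)
	{
		unsigned int free_blks = free_segs * blks_per_seg;
		unsigned int ovp_blks = ovp_segs * blks_per_seg;

		return free_blks < ovp_blks ? 0 : free_blks - ovp_blks;
	}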
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index f7f63a567d86..e54f8c08bda8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -851,7 +851,7 @@ out:
static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode, bool is_whiteout,
- struct inode **new_inode)
+ struct inode **new_inode, struct f2fs_filename *fname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -879,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (err)
goto out;
- err = f2fs_do_tmpfile(inode, dir);
+ err = f2fs_do_tmpfile(inode, dir, fname);
if (err)
goto release_out;
@@ -930,22 +930,24 @@ static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL);
+ err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL, NULL);
return finish_open_simple(file, err);
}
static int f2fs_create_whiteout(struct mnt_idmap *idmap,
- struct inode *dir, struct inode **whiteout)
+ struct inode *dir, struct inode **whiteout,
+ struct f2fs_filename *fname)
{
- return __f2fs_tmpfile(idmap, dir, NULL,
- S_IFCHR | WHITEOUT_MODE, true, whiteout);
+ return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE,
+ true, whiteout, fname);
}
int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct inode **new_inode)
{
- return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, false, new_inode);
+ return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG,
+ false, new_inode, NULL);
}
static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
@@ -989,7 +991,14 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(idmap, old_dir, &whiteout);
+ struct f2fs_filename fname;
+
+ err = f2fs_setup_filename(old_dir, &old_dentry->d_name,
+ 0, &fname);
+ if (err)
+ return err;
+
+ err = f2fs_create_whiteout(idmap, old_dir, &whiteout, &fname);
if (err)
return err;
}
@@ -1104,14 +1113,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
iput(whiteout);
}
- if (old_is_dir) {
- if (old_dir_entry)
- f2fs_set_link(old_inode, old_dir_entry,
- old_dir_page, new_dir);
- else
- f2fs_put_page(old_dir_page, 0);
+ if (old_dir_entry)
+ f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+ if (old_is_dir)
f2fs_i_links_write(old_dir, false);
- }
+
if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
if (S_ISDIR(old_inode->i_mode))
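The namei.c change resolves the old dentry's name once and threads it through __f2fs_tmpfile(), so the whiteout inode can be initialized with that name available. A condensed sketch of the new call order under RENAME_WHITEOUT (error unwinding and the fname lifetime follow the real function and are elided here):

	struct f2fs_filename fname;
	int err;

	/* Resolve the victim name first ... */
	err = f2fs_setup_filename(old_dir, &old_dentry->d_name, 0, &fname);
	if (err)
		return err;

	/* ... then create the whiteout, letting f2fs_do_tmpfile() consume
	 * the prepared name during inode setup. */
	err = f2fs_create_whiteout(idmap, old_dir, &whiteout, &fname);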
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9b546fd21010..b3de6d6cdb02 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -852,21 +852,29 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
f2fs_sb_has_readonly(sbi)) {
- unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
+ unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ pgoff_t fofs = index;
+ unsigned int c_len;
block_t blkaddr;
+ /* should align fofs and ofs_in_node to cluster_size */
+ if (fofs % cluster_size) {
+ fofs = round_down(fofs, cluster_size);
+ ofs_in_node = round_down(ofs_in_node, cluster_size);
+ }
+
+ c_len = f2fs_cluster_blocks_are_contiguous(dn, ofs_in_node);
if (!c_len)
goto out;
- blkaddr = f2fs_data_blkaddr(dn);
+ blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node);
if (blkaddr == COMPRESS_ADDR)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
- dn->ofs_in_node + 1);
+ ofs_in_node + 1);
f2fs_update_read_extent_tree_range_compressed(dn->inode,
- index, blkaddr,
- F2FS_I(dn->inode)->i_cluster_size,
- c_len);
+ fofs, blkaddr, cluster_size, c_len);
}
out:
return 0;
@@ -1919,7 +1927,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
for (i = 0; i < nr_folios; i++) {
struct page *page = &fbatch.folios[i]->page;
- if (!IS_DNODE(page))
+ if (!IS_INODE(page))
continue;
lock_page(page);
@@ -2841,7 +2849,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
int i, idx, last_offset, nrpages;
/* scan the node segment */
- last_offset = sbi->blocks_per_seg;
+ last_offset = BLKS_PER_SEG(sbi);
addr = START_BLOCK(sbi, segno);
sum_entry = &sum->entries[0];
@@ -3158,7 +3166,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return 0;
- nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
+ nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
struct page *page;
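In the read-only compressed-inode path above, the extent-cache update must start on a cluster boundary, hence the round_down() pair. A worked example with an assumed 16-block cluster:

	/* fofs = 37, ofs_in_node = 37, cluster_size = 16 (assumed):
	 * 37 % 16 != 0, so both are rounded down to the cluster start
	 * before the COMPRESS_ADDR/extent lookups run. */
	fofs = round_down(37, 16);		/* -> 32 */
	ofs_in_node = round_down(37, 16);	/* -> 32 */

Without the alignment, a lookup landing mid-cluster would record an extent range offset from the true cluster start.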
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 5bd16a95eef8..6aea13024ac1 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -208,10 +208,10 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
(block_off << 1) -
- (block_off & (sbi->blocks_per_seg - 1)));
+ (block_off & (BLKS_PER_SEG(sbi) - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
- block_addr += sbi->blocks_per_seg;
+ block_addr += BLKS_PER_SEG(sbi);
return block_addr;
}
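current_nat_addr() keeps a bitwise AND where a modulo is meant, which is valid only because BLKS_PER_SEG() is a power of two. The equivalence, as a sketch under that assumption:

	/* Assuming 512 blocks per segment (the common layout): */
	unsigned int blks_per_seg = 512;
	pgoff_t block_off = 1234;
	bool same = (block_off & (blks_per_seg - 1)) ==
		    (block_off % blks_per_seg);	/* always true here */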
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d0f24ccbd1ac..e7bf15b8240a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -354,7 +354,7 @@ static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
if (blkaddr + 1 == next_blkaddr)
ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
ra_blocks * 2);
- else if (next_blkaddr % sbi->blocks_per_seg)
+ else if (next_blkaddr % BLKS_PER_SEG(sbi))
ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
ra_blocks / 2);
return ra_blocks;
@@ -611,6 +611,19 @@ truncate_out:
return 0;
}
+static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn)
+{
+ int i, err = 0;
+
+ for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) {
+ err = f2fs_reserve_new_block(dn);
+ if (!err)
+ break;
+ }
+
+ return err;
+}
+
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page)
{
@@ -680,14 +693,12 @@ retry_dn:
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
if (__is_valid_data_blkaddr(dest) &&
!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
@@ -712,14 +723,8 @@ retry_dn:
*/
if (dest == NEW_ADDR) {
f2fs_truncate_data_blocks_range(&dn, 1);
- do {
- err = f2fs_reserve_new_block(&dn);
- if (err == -ENOSPC) {
- f2fs_bug_on(sbi, 1);
- break;
- }
- } while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+
+ err = f2fs_reserve_new_block_retry(&dn);
if (err)
goto err;
continue;
@@ -727,16 +732,8 @@ retry_dn:
/* dest is valid block, try to recover from src to dest */
if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
-
if (src == NULL_ADDR) {
- do {
- err = f2fs_reserve_new_block(&dn);
- if (err == -ENOSPC) {
- f2fs_bug_on(sbi, 1);
- break;
- }
- } while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+ err = f2fs_reserve_new_block_retry(&dn);
if (err)
goto err;
}
@@ -756,8 +753,6 @@ retry_prev:
f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
dest, inode->i_ino, dn.ofs_in_node);
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi,
- ERROR_INVALID_BLKADDR);
goto err;
}
@@ -852,7 +847,7 @@ next:
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
- f2fs_allocate_new_segments(sbi);
+ err = f2fs_allocate_new_segments(sbi);
return err;
}
@@ -864,7 +859,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
int ret = 0;
unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
- bool fix_curseg_write_pointer = false;
if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
f2fs_info(sbi, "recover fsync data on readonly fs");
@@ -895,8 +889,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
else
f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
skip:
- fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
-
destroy_fsync_dnodes(&inode_list, err);
destroy_fsync_dnodes(&tmp_inode_list, err);
@@ -914,11 +906,13 @@ skip:
* and the f2fs is not read only, check and fix zoned block devices'
* write pointer consistency.
*/
- if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
- f2fs_sb_has_blkzoned(sbi)) {
- err = f2fs_fix_curseg_write_pointer(sbi);
- if (!err)
- err = f2fs_check_write_pointer(sbi);
+ if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) {
+ int err2 = f2fs_fix_curseg_write_pointer(sbi);
+
+ if (!err2)
+ err2 = f2fs_check_write_pointer(sbi);
+ if (err2)
+ err = err2;
ret = err;
}
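f2fs_reserve_new_block_retry() folds the two open-coded retry loops (and their fault-injection special case) into one bounded helper. The shape is a generic bounded retry; a sketch with the attempt count as a parameter (the real DEFAULT_FAILURE_RETRY_COUNT constant is defined elsewhere in f2fs):

	/* Retry an operation a fixed number of times, keeping the last error. */
	static int retry_bounded(int (*op)(void *arg), void *arg, int attempts)
	{
		int err = 0;

		while (attempts-- > 0) {
			err = op(arg);
			if (!err)
				break;
		}
		return err;
	}

Compared with the removed loops, this also drops the f2fs_bug_on() on -ENOSPC: a reservation that keeps failing now propagates an error to recovery instead of asserting.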
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index e1065ba70207..4fd76e867e0a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,6 +192,9 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
if (!f2fs_is_atomic_file(inode))
return;
+ if (clean)
+ truncate_inode_pages_final(inode->i_mapping);
+
release_atomic_write_cnt(inode);
clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
clear_inode_flag(inode, FI_ATOMIC_REPLACE);
@@ -201,7 +204,6 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
F2FS_I(inode)->atomic_write_task = NULL;
if (clean) {
- truncate_inode_pages_final(inode->i_mapping);
f2fs_i_size_write(inode, fi->original_i_size);
fi->original_i_size = 0;
}
@@ -248,7 +250,7 @@ retry:
} else {
blkcnt_t count = 1;
- err = inc_valid_block_count(sbi, inode, &count);
+ err = inc_valid_block_count(sbi, inode, &count, true);
if (err) {
f2fs_put_dnode(&dn);
return err;
@@ -334,8 +336,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
ret = -EFSCORRUPTED;
- f2fs_handle_error(sbi,
- ERROR_INVALID_BLKADDR);
goto out;
}
@@ -400,6 +400,9 @@ int f2fs_commit_atomic_write(struct inode *inode)
*/
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+ if (f2fs_cp_error(sbi))
+ return;
+
if (time_to_inject(sbi, FAULT_CHECKPOINT))
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
@@ -448,8 +451,8 @@ static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
unsigned int meta = get_pages(sbi, F2FS_DIRTY_META);
unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
- unsigned int threshold = sbi->blocks_per_seg * factor *
- DEFAULT_DIRTY_THRESHOLD;
+ unsigned int threshold =
+ SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD));
unsigned int global_threshold = threshold * 3 / 2;
if (dents >= threshold || qdata >= threshold ||
@@ -872,7 +875,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
{
int ovp_hole_segs =
(overprovision_segments(sbi) - reserved_segments(sbi));
- block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg;
+ block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs);
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
block_t holes[2] = {0, 0}; /* DATA and NODE */
block_t unusable;
@@ -901,11 +904,16 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
{
int ovp_hole_segs =
(overprovision_segments(sbi) - reserved_segments(sbi));
+
+ if (F2FS_OPTION(sbi).unusable_cap_perc == 100)
+ return 0;
if (unusable > F2FS_OPTION(sbi).unusable_cap)
return -EAGAIN;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
dirty_segments(sbi) > ovp_hole_segs)
return -EAGAIN;
+ if (has_not_enough_free_secs(sbi, 0, 0))
+ return -EAGAIN;
return 0;
}
@@ -1132,8 +1140,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
struct seg_entry *sentry;
unsigned int segno;
block_t blk = start;
- unsigned long offset, size, max_blocks = sbi->blocks_per_seg;
- unsigned long *map;
+ unsigned long offset, size, *map;
while (blk < end) {
segno = GET_SEGNO(sbi, blk);
@@ -1143,7 +1150,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
if (end < START_BLOCK(sbi, segno + 1))
size = GET_BLKOFF_FROM_SEG0(sbi, end);
else
- size = max_blocks;
+ size = BLKS_PER_SEG(sbi);
map = (unsigned long *)(sentry->cur_valid_map);
offset = __find_rev_next_bit(map, size, offset);
f2fs_bug_on(sbi, offset != size);
@@ -2048,7 +2055,6 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
bool check_only)
{
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
- int max_blocks = sbi->blocks_per_seg;
struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
@@ -2060,8 +2066,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
- if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
- !f2fs_block_unit_discard(sbi))
+ if (se->valid_blocks == BLKS_PER_SEG(sbi) ||
+ !f2fs_hw_support_discard(sbi) ||
+ !f2fs_block_unit_discard(sbi))
return false;
if (!force) {
@@ -2078,13 +2085,14 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
while (force || SM_I(sbi)->dcc_info->nr_discards <=
SM_I(sbi)->dcc_info->max_discards) {
- start = __find_rev_next_bit(dmap, max_blocks, end + 1);
- if (start >= max_blocks)
+ start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1);
+ if (start >= BLKS_PER_SEG(sbi))
break;
- end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
- if (force && start && end != max_blocks
- && (end - start) < cpc->trim_minlen)
+ end = __find_rev_next_zero_bit(dmap,
+ BLKS_PER_SEG(sbi), start + 1);
+ if (force && start && end != BLKS_PER_SEG(sbi) &&
+ (end - start) < cpc->trim_minlen)
continue;
if (check_only)
@@ -2166,8 +2174,8 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
start + 1);
if (section_alignment) {
- start = rounddown(start, sbi->segs_per_sec);
- end = roundup(end, sbi->segs_per_sec);
+ start = rounddown(start, SEGS_PER_SEC(sbi));
+ end = roundup(end, SEGS_PER_SEC(sbi));
}
for (i = start; i < end; i++) {
@@ -2186,7 +2194,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
if (!f2fs_sb_has_blkzoned(sbi) &&
(!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
- (end - start) << sbi->log_blocks_per_seg);
+ SEGS_TO_BLKS(sbi, end - start));
continue;
}
next:
@@ -2195,9 +2203,9 @@ next:
if (!IS_CURSEC(sbi, secno) &&
!get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
- sbi->segs_per_sec << sbi->log_blocks_per_seg);
+ BLKS_PER_SEC(sbi));
- start = start_segno + sbi->segs_per_sec;
+ start = start_segno + SEGS_PER_SEC(sbi);
if (start < end)
goto next;
else
@@ -2216,7 +2224,7 @@ next:
find_next:
if (is_valid) {
next_pos = find_next_zero_bit_le(entry->discard_map,
- sbi->blocks_per_seg, cur_pos);
+ BLKS_PER_SEG(sbi), cur_pos);
len = next_pos - cur_pos;
if (f2fs_sb_has_blkzoned(sbi) ||
@@ -2228,13 +2236,13 @@ find_next:
total_len += len;
} else {
next_pos = find_next_bit_le(entry->discard_map,
- sbi->blocks_per_seg, cur_pos);
+ BLKS_PER_SEG(sbi), cur_pos);
}
skip:
cur_pos = next_pos;
is_valid = !is_valid;
- if (cur_pos < sbi->blocks_per_seg)
+ if (cur_pos < BLKS_PER_SEG(sbi))
goto find_next;
release_discard_addr(entry);
@@ -2251,6 +2259,12 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
int err = 0;
+ if (f2fs_sb_has_readonly(sbi)) {
+ f2fs_info(sbi,
+ "Skip to start discard thread for readonly image");
+ return 0;
+ }
+
if (!f2fs_realtime_discard_enable(sbi))
return 0;
@@ -2283,7 +2297,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
- dcc->discard_granularity = sbi->blocks_per_seg;
+ dcc->discard_granularity = BLKS_PER_SEG(sbi);
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
dcc->discard_granularity = BLKS_PER_SEC(sbi);
@@ -2297,7 +2311,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
atomic_set(&dcc->queued_discard, 0);
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
- dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi));
dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
@@ -2405,6 +2419,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
#endif
segno = GET_SEGNO(sbi, blkaddr);
+ if (segno == NULL_SEGNO)
+ return;
se = get_seg_entry(sbi, segno);
new_vblocks = se->valid_blocks + del;
@@ -2546,7 +2562,7 @@ static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int typ
struct curseg_info *curseg = CURSEG_I(sbi, type);
if (sbi->ckpt->alloc_type[type] == SSR)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
return curseg->next_blkoff;
}
@@ -2634,7 +2650,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi,
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+ if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi))
return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -2643,54 +2659,51 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi,
* Find a new segment from the free segments bitmap to right order
* This function should be returned with success, otherwise BUG
*/
-static void get_new_segment(struct f2fs_sb_info *sbi,
- unsigned int *newseg, bool new_sec, int dir)
+static int get_new_segment(struct f2fs_sb_info *sbi,
+ unsigned int *newseg, bool new_sec, bool pinning)
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
- unsigned int left_start = hint;
bool init = true;
- int go_left = 0;
int i;
+ int ret = 0;
spin_lock(&free_i->segmap_lock);
- if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+ if (time_to_inject(sbi, FAULT_NO_SEGMENT)) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) {
segno = find_next_zero_bit(free_i->free_segmap,
GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
goto got_it;
}
+
+ /*
+ * If we format f2fs on zoned storage, let's try to get pinned sections
+ * from beginning of the storage, which should be a conventional one.
+ */
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ segno = pinning ? 0 : max(first_zoned_segno(sbi), *newseg);
+ hint = GET_SEC_FROM_SEG(sbi, segno);
+ }
+
find_other_zone:
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
if (secno >= MAIN_SECS(sbi)) {
- if (dir == ALLOC_RIGHT) {
- secno = find_first_zero_bit(free_i->free_secmap,
+ secno = find_first_zero_bit(free_i->free_secmap,
MAIN_SECS(sbi));
- f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
- } else {
- go_left = 1;
- left_start = hint - 1;
+ if (secno >= MAIN_SECS(sbi)) {
+ ret = -ENOSPC;
+ goto out_unlock;
}
}
- if (go_left == 0)
- goto skip_left;
-
- while (test_bit(left_start, free_i->free_secmap)) {
- if (left_start > 0) {
- left_start--;
- continue;
- }
- left_start = find_first_zero_bit(free_i->free_secmap,
- MAIN_SECS(sbi));
- f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
- break;
- }
- secno = left_start;
-skip_left:
segno = GET_SEG_FROM_SEC(sbi, secno);
zoneno = GET_ZONE_FROM_SEC(sbi, secno);
@@ -2701,21 +2714,13 @@ skip_left:
goto got_it;
if (zoneno == old_zoneno)
goto got_it;
- if (dir == ALLOC_LEFT) {
- if (!go_left && zoneno + 1 >= total_zones)
- goto got_it;
- if (go_left && zoneno == 0)
- goto got_it;
- }
for (i = 0; i < NR_CURSEG_TYPE; i++)
if (CURSEG_I(sbi, i)->zone == zoneno)
break;
if (i < NR_CURSEG_TYPE) {
/* zone is in user, try another */
- if (go_left)
- hint = zoneno * sbi->secs_per_zone - 1;
- else if (zoneno + 1 >= total_zones)
+ if (zoneno + 1 >= total_zones)
hint = 0;
else
hint = (zoneno + 1) * sbi->secs_per_zone;
@@ -2725,9 +2730,23 @@ skip_left:
got_it:
/* set it as dirty segment in free segmap */
f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
+
+ /* no free section in conventional zone */
+ if (new_sec && pinning &&
+ !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
__set_inuse(sbi, segno);
*newseg = segno;
+out_unlock:
spin_unlock(&free_i->segmap_lock);
+
+ if (ret == -ENOSPC) {
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT);
+ f2fs_bug_on(sbi, 1);
+ }
+ return ret;
}
static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -2736,6 +2755,10 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
struct summary_footer *sum_footer;
unsigned short seg_type = curseg->seg_type;
+ /* only happen when get_new_segment() fails */
+ if (curseg->next_segno == NULL_SEGNO)
+ return;
+
curseg->inited = true;
curseg->segno = curseg->next_segno;
curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
@@ -2761,9 +2784,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
sanity_check_seg_type(sbi, seg_type);
if (f2fs_need_rand_seg(sbi))
- return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
+ return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
- /* if segs_per_sec is large than 1, we need to keep original policy. */
if (__is_large_section(sbi))
return curseg->segno;
@@ -2774,8 +2796,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return 0;
- if (test_opt(sbi, NOHEAP) &&
- (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)))
+ if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))
return 0;
if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
@@ -2792,30 +2813,31 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
*/
-static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned short seg_type = curseg->seg_type;
unsigned int segno = curseg->segno;
- int dir = ALLOC_LEFT;
+ bool pinning = type == CURSEG_COLD_DATA_PINNED;
+ int ret;
if (curseg->inited)
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, segno));
- if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA)
- dir = ALLOC_RIGHT;
-
- if (test_opt(sbi, NOHEAP))
- dir = ALLOC_RIGHT;
+ write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno));
segno = __get_next_segno(sbi, type);
- get_new_segment(sbi, &segno, new_sec, dir);
+ ret = get_new_segment(sbi, &segno, new_sec, pinning);
+ if (ret) {
+ if (ret == -ENOSPC)
+ curseg->segno = NULL_SEGNO;
+ return ret;
+ }
+
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
curseg->alloc_type = LFS;
if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
curseg->fragment_remained_chunk =
get_random_u32_inclusive(1, sbi->max_fragment_chunk);
+ return 0;
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2831,7 +2853,7 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi,
for (i = 0; i < entries; i++)
target_map[i] = ckpt_map[i] | cur_map[i];
- return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
+ return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start);
}
static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
@@ -2842,14 +2864,14 @@ static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
{
- return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
+ return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi);
}
/*
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type)
+static int change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2874,21 +2896,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
if (IS_ERR(sum_page)) {
/* GC won't be able to use stale summary pages by cp_error */
memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
- return;
+ return PTR_ERR(sum_page);
}
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
f2fs_put_page(sum_page, 1);
+ return 0;
}
static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
int alloc_mode, unsigned long long age);
-static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
+static int get_atssr_segment(struct f2fs_sb_info *sbi, int type,
int target_type, int alloc_mode,
unsigned long long age)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
+ int ret = 0;
curseg->seg_type = target_type;
@@ -2896,38 +2920,41 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
curseg->seg_type = se->type;
- change_curseg(sbi, type);
+ ret = change_curseg(sbi, type);
} else {
/* allocate cold segment by default */
curseg->seg_type = CURSEG_COLD_DATA;
- new_curseg(sbi, type, true);
+ ret = new_curseg(sbi, type, true);
}
stat_inc_seg_type(sbi, curseg);
+ return ret;
}
-static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
+static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC);
+ int ret = 0;
if (!sbi->am.atgc_enabled)
- return;
+ return 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
- get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0);
+ ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC,
+ CURSEG_COLD_DATA, SSR, 0);
up_write(&SIT_I(sbi)->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
-
+ return ret;
}
-void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
+int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
{
- __f2fs_init_atgc_curseg(sbi);
+ return __f2fs_init_atgc_curseg(sbi);
}
static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
@@ -3055,11 +3082,12 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
return false;
}
-void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno;
+ int ret = 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
@@ -3070,9 +3098,9 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
goto unlock;
if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type);
+ ret = change_curseg(sbi, type);
else
- new_curseg(sbi, type, true);
+ ret = new_curseg(sbi, type, true);
stat_inc_seg_type(sbi, curseg);
@@ -3086,45 +3114,84 @@ unlock:
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+ return ret;
}
-static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
+static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
bool new_sec, bool force)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int old_segno;
+ int err = 0;
+
+ if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited)
+ goto allocate;
if (!force && curseg->inited &&
!curseg->next_blkoff &&
!get_valid_blocks(sbi, curseg->segno, new_sec) &&
!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
- return;
+ return 0;
+allocate:
old_segno = curseg->segno;
- new_curseg(sbi, type, true);
+ err = new_curseg(sbi, type, true);
+ if (err)
+ return err;
stat_inc_seg_type(sbi, curseg);
locate_dirty_segment(sbi, old_segno);
+ return 0;
}
-void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
+int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
+ int ret;
+
f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
- __allocate_new_segment(sbi, type, true, force);
+ ret = __allocate_new_segment(sbi, type, true, force);
up_write(&SIT_I(sbi)->sentry_lock);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+
+ return ret;
}
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi)
+{
+ int err;
+ bool gc_required = true;
+
+retry:
+ f2fs_lock_op(sbi);
+ err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
+ f2fs_unlock_op(sbi);
+
+ if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) {
+ f2fs_down_write(&sbi->gc_lock);
+ err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1);
+ f2fs_up_write(&sbi->gc_lock);
+
+ gc_required = false;
+ if (!err)
+ goto retry;
+ }
+
+ return err;
+}
+
+int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
+ int err = 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
- __allocate_new_segment(sbi, i, false, false);
+ err += __allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+
+ return err;
}
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3242,8 +3309,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
GET_SEGNO(sbi, end);
if (need_align) {
- start_segno = rounddown(start_segno, sbi->segs_per_sec);
- end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1;
+ start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi));
+ end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1;
}
cpc.reason = CP_DISCARD;
@@ -3416,7 +3483,14 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
-void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+static void reset_curseg_fields(struct curseg_info *curseg)
+{
+ curseg->inited = false;
+ curseg->segno = NULL_SEGNO;
+ curseg->next_segno = 0;
+}
+
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio)
@@ -3427,12 +3501,18 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
bool segment_full = false;
+ int ret = 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
+ if (curseg->segno == NULL_SEGNO) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
+
if (from_gc) {
f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
@@ -3441,7 +3521,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
}
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
+ f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi));
f2fs_wait_discard_bio(sbi, *new_blkaddr);
@@ -3470,25 +3550,35 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
* since SSR needs latest valid block information.
*/
update_sit_entry(sbi, *new_blkaddr, 1);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
- update_sit_entry(sbi, old_blkaddr, -1);
+ update_sit_entry(sbi, old_blkaddr, -1);
/*
* If the current segment is full, flush it out and replace it with a
* new segment.
*/
if (segment_full) {
+ if (type == CURSEG_COLD_DATA_PINNED &&
+ !((curseg->segno + 1) % sbi->segs_per_sec)) {
+ reset_curseg_fields(curseg);
+ goto skip_new_segment;
+ }
+
if (from_gc) {
- get_atssr_segment(sbi, type, se->type,
+ ret = get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
} else {
if (need_new_seg(sbi, type))
- new_curseg(sbi, type, false);
+ ret = new_curseg(sbi, type, false);
else
- change_curseg(sbi, type);
+ ret = change_curseg(sbi, type);
stat_inc_seg_type(sbi, curseg);
}
+
+ if (ret)
+ goto out_err;
}
+
+skip_new_segment:
/*
* segment dirty status should be updated after segment allocation,
* so we just need to update status only one time after previous
@@ -3497,12 +3587,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
- if (IS_DATASEG(type))
+ if (IS_DATASEG(curseg->seg_type))
atomic64_inc(&sbi->allocated_data_blocks);
up_write(&sit_i->sentry_lock);
- if (page && IS_NODESEG(type)) {
+ if (page && IS_NODESEG(curseg->seg_type)) {
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
f2fs_inode_chksum_set(sbi, page);
@@ -3511,9 +3601,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (fio) {
struct f2fs_bio_info *io;
- if (F2FS_IO_ALIGNED(sbi))
- fio->retry = 0;
-
INIT_LIST_HEAD(&fio->list);
fio->in_list = 1;
io = sbi->write_io[fio->type] + fio->temp;
@@ -3523,8 +3610,15 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
}
mutex_unlock(&curseg->curseg_mutex);
-
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+ return 0;
+out_err:
+ *new_blkaddr = NULL_ADDR;
+ up_write(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
+ return ret;
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3561,21 +3655,25 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
-reallocate:
- f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
- &fio->new_blkaddr, sum, type, fio);
+
+ if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type, fio)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
+ fscrypt_finalize_bounce_page(&fio->encrypted_page);
+ if (PageWriteback(fio->page))
+ end_page_writeback(fio->page);
+ if (f2fs_in_warm_node_list(fio->sbi, fio->page))
+ f2fs_del_fsync_node_entry(fio->sbi, fio->page);
+ goto out;
+ }
if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr);
/* writeout dirty page into bdev */
f2fs_submit_page_write(fio);
- if (fio->retry) {
- fio->old_blkaddr = fio->new_blkaddr;
- goto reallocate;
- }
f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
-
+out:
if (keep_order)
f2fs_up_read(&fio->sbi->io_order_lock);
}
@@ -3659,8 +3757,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
}
if (fio->post_read)
- invalidate_mapping_pages(META_MAPPING(sbi),
- fio->new_blkaddr, fio->new_blkaddr);
+ f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1);
stat_inc_inplace_blocks(fio->sbi);
@@ -3749,7 +3846,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type);
+ if (change_curseg(sbi, type))
+ goto out_unlock;
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -3775,12 +3873,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type);
+ if (change_curseg(sbi, type))
+ goto out_unlock;
}
curseg->next_blkoff = old_blkoff;
curseg->alloc_type = old_alloc_type;
}
+out_unlock:
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_write(&SM_I(sbi)->curseg_lock);
@@ -3850,7 +3950,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
for (i = 0; i < len; i++)
f2fs_wait_on_block_writeback(inode, blkaddr + i);
- invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1);
+ f2fs_truncate_meta_inode_pages(sbi, blkaddr, len);
}
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
@@ -3892,7 +3992,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
seg_i->next_blkoff = blk_off;
if (seg_i->alloc_type == SSR)
- blk_off = sbi->blocks_per_seg;
+ blk_off = BLKS_PER_SEG(sbi);
for (j = 0; j < blk_off; j++) {
struct f2fs_summary *s;
@@ -3960,7 +4060,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
struct f2fs_summary *ns = &sum->entries[0];
int i;
- for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+ for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) {
ns->version = 0;
ns->ofs_in_node = 0;
}
@@ -4466,7 +4566,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
#endif
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
- sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+ sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs);
sit_i->written_valid_blocks = 0;
sit_i->bitmap_size = sit_bitmap_size;
sit_i->dirty_sentries = 0;
@@ -4539,9 +4639,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
array[i].seg_type = CURSEG_COLD_DATA;
else if (i == CURSEG_ALL_DATA_ATGC)
array[i].seg_type = CURSEG_COLD_DATA;
- array[i].segno = NULL_SEGNO;
- array[i].next_blkoff = 0;
- array[i].inited = false;
+ reset_curseg_fields(&array[i]);
}
return restore_curseg_summaries(sbi);
}
@@ -4593,21 +4691,20 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
- if (f2fs_block_unit_discard(sbi)) {
- /* build discard map only one time */
- if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
- memset(se->discard_map, 0xff,
+ if (!f2fs_block_unit_discard(sbi))
+ goto init_discard_map_done;
+
+ /* build discard map only one time */
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
SIT_VBLOCK_MAP_SIZE);
- } else {
- memcpy(se->discard_map,
- se->cur_valid_map,
+ goto init_discard_map_done;
+ }
+ memcpy(se->discard_map, se->cur_valid_map,
SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks +=
- sbi->blocks_per_seg -
+ sbi->discard_blks += BLKS_PER_SEG(sbi) -
se->valid_blocks;
- }
- }
-
+init_discard_map_done:
if (__is_large_section(sbi))
get_sec_entry(sbi, start)->valid_blocks +=
se->valid_blocks;
@@ -4747,7 +4844,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
return;
mutex_lock(&dirty_i->seglist_lock);
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
valid_blocks = get_valid_blocks(sbi, segno, true);
secno = GET_SEC_FROM_SEG(sbi, segno);
@@ -4846,7 +4943,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
if (curseg->alloc_type == SSR)
continue;
- for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
+ for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) {
if (!f2fs_test_bit(blkofs, se->cur_valid_map))
continue;
out:
@@ -4862,6 +4959,16 @@ out:
}
#ifdef CONFIG_BLK_DEV_ZONED
+static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = {
+ [BLK_ZONE_COND_NOT_WP] = "NOT_WP",
+ [BLK_ZONE_COND_EMPTY] = "EMPTY",
+ [BLK_ZONE_COND_IMP_OPEN] = "IMPLICIT_OPEN",
+ [BLK_ZONE_COND_EXP_OPEN] = "EXPLICIT_OPEN",
+ [BLK_ZONE_COND_CLOSED] = "CLOSED",
+ [BLK_ZONE_COND_READONLY] = "READONLY",
+ [BLK_ZONE_COND_FULL] = "FULL",
+ [BLK_ZONE_COND_OFFLINE] = "OFFLINE",
+};
static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
struct f2fs_dev_info *fdev,
@@ -4883,14 +4990,19 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
* Skip check of zones cursegs point to, since
* fix_curseg_write_pointer() checks them.
*/
- if (zone_segno >= MAIN_SEGS(sbi) ||
- IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno)))
+ if (zone_segno >= MAIN_SEGS(sbi))
return 0;
/*
* Get # of valid block of the zone.
*/
valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
+ if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
+ f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
+ zone_segno, valid_block_cnt,
+ f2fs_zone_status[zone->cond]);
+ return 0;
+ }
if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) ||
(valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL))
@@ -4898,8 +5010,8 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
if (!valid_block_cnt) {
f2fs_notice(sbi, "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: cond[0x%x]",
- zone->cond);
+ "pointer. Reset the write pointer: cond[%s]",
+ f2fs_zone_status[zone->cond]);
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -4916,8 +5028,8 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
* selected for write operation until it get discarded.
*/
f2fs_notice(sbi, "Valid blocks are not aligned with write "
- "pointer: valid block[0x%x,0x%x] cond[0x%x]",
- zone_segno, valid_block_cnt, zone->cond);
+ "pointer: valid block[0x%x,0x%x] cond[%s]",
+ zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]);
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
@@ -5128,7 +5240,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
unsigned int secno;
if (!sbi->unusable_blocks_per_sec)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
secno = GET_SEC_FROM_SEG(sbi, segno);
seg_start = START_BLOCK(sbi, segno);
@@ -5143,10 +5255,10 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
*/
if (seg_start >= sec_cap_blkaddr)
return 0;
- if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr)
+ if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr)
return sec_cap_blkaddr - seg_start;
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
}
#else
int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
@@ -5172,7 +5284,7 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi))
return f2fs_usable_zone_blks_in_seg(sbi, segno);
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
}
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
@@ -5181,7 +5293,7 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi))
return CAP_SEGS_PER_SEC(sbi);
- return sbi->segs_per_sec;
+ return SEGS_PER_SEC(sbi);
}
/*
@@ -5196,14 +5308,14 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
sit_i->min_mtime = ULLONG_MAX;
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
unsigned int i;
unsigned long long mtime = 0;
- for (i = 0; i < sbi->segs_per_sec; i++)
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++)
mtime += get_seg_entry(sbi, segno + i)->mtime;
- mtime = div_u64(mtime, sbi->segs_per_sec);
+ mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
if (sit_i->min_mtime > mtime)
sit_i->min_mtime = mtime;
@@ -5242,7 +5354,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC);
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
- sm_info->min_seq_blocks = sbi->blocks_per_seg;
+ sm_info->min_seq_blocks = BLKS_PER_SEG(sbi);
sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
sm_info->min_ssr_sections = reserved_sections(sbi);
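The segment.c theme is turning void allocators into int-returning ones: get_new_segment() can now fail with -ENOSPC (after which the checkpoint is stopped and curseg->segno is set to NULL_SEGNO) or -EAGAIN (no pinnable section in the conventional zone). Callers therefore check before writing; a condensed caller-side sketch mirroring do_write_page() above, with cleanup details elided:

	err = f2fs_allocate_data_block(sbi, page, old_blkaddr,
				       &new_blkaddr, sum, type, fio);
	if (err) {
		/* new_blkaddr == NULL_ADDR here: unwind encryption and
		 * writeback state rather than submitting the page. */
		return err;
	}
	f2fs_submit_page_write(fio);

For pinned files, f2fs_allocate_pinning_section() retries once after forcing GC over the conventional zone, which is why -EAGAIN is treated as retryable there.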
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 8129be788bd5..e1c0f418aa11 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -48,21 +48,21 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define IS_CURSEC(sbi, secno) \
(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
- (sbi)->segs_per_sec))
+ SEGS_PER_SEC(sbi)))
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
@@ -77,40 +77,37 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define TOTAL_SEGS(sbi) \
(SM_I(sbi) ? SM_I(sbi)->segment_count : \
le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count))
-#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi) (SEGS_TO_BLKS(sbi, TOTAL_SEGS(sbi)))
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \
(sbi)->log_blocks_per_seg))
#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
- (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
+ (SEGS_TO_BLKS(sbi, GET_R2L_SEGNO(FREE_I(sbi), segno))))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
(START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
+ (BLKS_TO_SEGS(sbi, GET_SEGOFF_FROM_SEG0(sbi, blk_addr)))
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (BLKS_PER_SEG(sbi) - 1))
#define GET_SEGNO(sbi, blk_addr) \
((!__is_valid_data_blkaddr(blk_addr)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define BLKS_PER_SEC(sbi) \
- ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
#define CAP_BLKS_PER_SEC(sbi) \
- ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \
- (sbi)->unusable_blocks_per_sec)
+ (BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec)
#define CAP_SEGS_PER_SEC(sbi) \
- ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\
- (sbi)->log_blocks_per_seg))
+ (SEGS_PER_SEC(sbi) - \
+ BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec))
#define GET_SEC_FROM_SEG(sbi, segno) \
- (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1 : (segno) / SEGS_PER_SEC(sbi))
#define GET_SEG_FROM_SEC(sbi, secno) \
- ((secno) * (sbi)->segs_per_sec)
+ ((secno) * SEGS_PER_SEC(sbi))
#define GET_ZONE_FROM_SEC(sbi, secno) \
(((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
@@ -139,16 +136,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
/*
- * indicate a block allocation direction: RIGHT and LEFT.
- * RIGHT means allocating new sections towards the end of volume.
- * LEFT means the opposite direction.
- */
-enum {
- ALLOC_RIGHT = 0,
- ALLOC_LEFT
-};
-
-/*
* In the victim_sel_policy->alloc_mode, there are three block allocation modes.
* LFS writes data sequentially with cleaning operations.
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
@@ -364,7 +351,7 @@ static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
unsigned int blocks = 0;
int i;
- for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) {
struct seg_entry *se = get_seg_entry(sbi, start_segno);
blocks += se->ckpt_valid_blocks;
@@ -449,7 +436,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
free_i->free_segments++;
next = find_next_bit(free_i->free_segmap,
- start_segno + sbi->segs_per_sec, start_segno);
+ start_segno + SEGS_PER_SEC(sbi), start_segno);
if (next >= start_segno + usable_segs) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
@@ -485,7 +472,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
if (!inmem && IS_CURSEC(sbi, secno))
goto skip_free;
next = find_next_bit(free_i->free_segmap,
- start_segno + sbi->segs_per_sec, start_segno);
+ start_segno + SEGS_PER_SEC(sbi), start_segno);
if (next >= start_segno + usable_segs) {
if (test_and_clear_bit(secno, free_i->free_secmap))
free_i->free_sections++;
@@ -573,23 +560,22 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
unsigned int node_blocks, unsigned int dent_blocks)
{
unsigned int segno, left_blocks;
int i;
- /* check current node segment */
+ /* check current node sections in the worst case. */
for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
segno = CURSEG_I(sbi, i)->segno;
- left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
- get_seg_entry(sbi, segno)->ckpt_valid_blocks;
-
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ get_ckpt_valid_blocks(sbi, segno, true);
if (node_blocks > left_blocks)
return false;
}
- /* check current data segment */
+ /* check current data section for dentry blocks. */
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
- left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
- get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ get_ckpt_valid_blocks(sbi, segno, true);
if (dent_blocks > left_blocks)
return false;
return true;
@@ -638,7 +624,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
if (free_secs > upper_secs)
return false;
- else if (free_secs <= lower_secs)
+ if (free_secs <= lower_secs)
return true;
return !curseg_space;
}
@@ -793,10 +779,10 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- if (usable_blks_per_seg < sbi->blocks_per_seg)
+ if (usable_blks_per_seg < BLKS_PER_SEG(sbi))
f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map,
- sbi->blocks_per_seg,
- usable_blks_per_seg) != sbi->blocks_per_seg);
+ BLKS_PER_SEG(sbi),
+ usable_blks_per_seg) != BLKS_PER_SEG(sbi));
/* check segment usage, and check boundary of a given segment number */
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
@@ -915,9 +901,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
return 0;
if (type == DATA)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
else if (type == NODE)
- return 8 * sbi->blocks_per_seg;
+ return SEGS_TO_BLKS(sbi, 8);
else if (type == META)
return 8 * BIO_MAX_VECS;
else
@@ -969,3 +955,13 @@ wake_up:
dcc->discard_wake = true;
wake_up_interruptible_all(&dcc->discard_wait_queue);
}
+
+static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi)
+{
+ int devi;
+
+ for (devi = 0; devi < sbi->s_ndevs; devi++)
+ if (bdev_is_zoned(FDEV(devi).bdev))
+ return GET_SEGNO(sbi, FDEV(devi).start_blk);
+ return 0;
+}
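first_zoned_segno() above scans the devices in order and returns the first segment backed by a zoned bdev; get_new_segment() uses it to steer non-pinned allocation past the conventional area. Relatedly, CAP_SEGS_PER_SEC() converts the per-section unusable tail (zone capacity < zone size) back into whole segments. A worked example with assumed geometry:

	/* 64 segs/sec, 512-block segments (log = 9), 1024 unusable blocks:
	 * 1024 >> 9 = 2 segments lost, so 62 of 64 segments are usable. */
	unsigned int segs_per_sec = 64;
	unsigned int unusable_blocks_per_sec = 1024;
	unsigned int cap_segs = segs_per_sec -
			(unusable_blocks_per_sec >> 9);	/* -> 62 */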
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index b880b746f226..a6867f26f141 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -44,24 +44,26 @@ static struct kmem_cache *f2fs_inode_cachep;
#ifdef CONFIG_F2FS_FAULT_INJECTION
const char *f2fs_fault_name[FAULT_MAX] = {
- [FAULT_KMALLOC] = "kmalloc",
- [FAULT_KVMALLOC] = "kvmalloc",
- [FAULT_PAGE_ALLOC] = "page alloc",
- [FAULT_PAGE_GET] = "page get",
- [FAULT_ALLOC_NID] = "alloc nid",
- [FAULT_ORPHAN] = "orphan",
- [FAULT_BLOCK] = "no more block",
- [FAULT_DIR_DEPTH] = "too big dir depth",
- [FAULT_EVICT_INODE] = "evict_inode fail",
- [FAULT_TRUNCATE] = "truncate fail",
- [FAULT_READ_IO] = "read IO error",
- [FAULT_CHECKPOINT] = "checkpoint error",
- [FAULT_DISCARD] = "discard error",
- [FAULT_WRITE_IO] = "write IO error",
- [FAULT_SLAB_ALLOC] = "slab alloc",
- [FAULT_DQUOT_INIT] = "dquot initialize",
- [FAULT_LOCK_OP] = "lock_op",
- [FAULT_BLKADDR] = "invalid blkaddr",
+ [FAULT_KMALLOC] = "kmalloc",
+ [FAULT_KVMALLOC] = "kvmalloc",
+ [FAULT_PAGE_ALLOC] = "page alloc",
+ [FAULT_PAGE_GET] = "page get",
+ [FAULT_ALLOC_NID] = "alloc nid",
+ [FAULT_ORPHAN] = "orphan",
+ [FAULT_BLOCK] = "no more block",
+ [FAULT_DIR_DEPTH] = "too big dir depth",
+ [FAULT_EVICT_INODE] = "evict_inode fail",
+ [FAULT_TRUNCATE] = "truncate fail",
+ [FAULT_READ_IO] = "read IO error",
+ [FAULT_CHECKPOINT] = "checkpoint error",
+ [FAULT_DISCARD] = "discard error",
+ [FAULT_WRITE_IO] = "write IO error",
+ [FAULT_SLAB_ALLOC] = "slab alloc",
+ [FAULT_DQUOT_INIT] = "dquot initialize",
+ [FAULT_LOCK_OP] = "lock_op",
+ [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr",
+ [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr",
+ [FAULT_NO_SEGMENT] = "no free segment",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -137,7 +139,6 @@ enum {
Opt_resgid,
Opt_resuid,
Opt_mode,
- Opt_io_size_bits,
Opt_fault_injection,
Opt_fault_type,
Opt_lazytime,
@@ -216,7 +217,6 @@ static match_table_t f2fs_tokens = {
{Opt_resgid, "resgid=%u"},
{Opt_resuid, "resuid=%u"},
{Opt_mode, "mode=%s"},
- {Opt_io_size_bits, "io_bits=%u"},
{Opt_fault_injection, "fault_injection=%u"},
{Opt_fault_type, "fault_type=%u"},
{Opt_lazytime, "lazytime"},
@@ -263,7 +263,8 @@ static match_table_t f2fs_tokens = {
{Opt_err, NULL},
};
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -274,8 +275,12 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
level = printk_get_level(fmt);
vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
- printk("%c%cF2FS-fs (%s): %pV\n",
- KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ if (limit_rate)
+ printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ else
+ printk("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
va_end(args);
}
@@ -343,46 +348,6 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).s_resgid));
}
-static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi)
-{
- unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec;
- unsigned int avg_vblocks;
- unsigned int wanted_reserved_segments;
- block_t avail_user_block_count;
-
- if (!F2FS_IO_ALIGNED(sbi))
- return 0;
-
- /* average valid block count in section in worst case */
- avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi);
-
- /*
- * we need enough free space when migrating one section in worst case
- */
- wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) *
- reserved_segments(sbi);
- wanted_reserved_segments -= reserved_segments(sbi);
-
- avail_user_block_count = sbi->user_block_count -
- sbi->current_reserved_blocks -
- F2FS_OPTION(sbi).root_reserved_blocks;
-
- if (wanted_reserved_segments * sbi->blocks_per_seg >
- avail_user_block_count) {
- f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u",
- wanted_reserved_segments,
- avail_user_block_count >> sbi->log_blocks_per_seg);
- return -ENOSPC;
- }
-
- SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments;
-
- f2fs_info(sbi, "IO align feature needs additional reserved segment: %u",
- wanted_reserved_segments);
-
- return 0;
-}
-
static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
{
if (!F2FS_OPTION(sbi).unusable_cap_perc)
@@ -663,7 +628,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
#ifdef CONFIG_F2FS_FS_ZSTD
static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
{
- unsigned int level;
+ int level;
int len = 4;
if (strlen(str) == len) {
@@ -677,9 +642,15 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
return -EINVAL;
}
- if (kstrtouint(str + 1, 10, &level))
+ if (kstrtoint(str + 1, 10, &level))
return -EINVAL;
+ /* f2fs does not support negative compress level for now */
+ if (level < 0) {
+ f2fs_info(sbi, "do not support negative compress level: %d", level);
+ return -ERANGE;
+ }
+
if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) {
f2fs_info(sbi, "invalid zstd compress level: %d", level);
return -EINVAL;
@@ -763,10 +734,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
clear_opt(sbi, DISCARD);
break;
case Opt_noheap:
- set_opt(sbi, NOHEAP);
- break;
case Opt_heap:
- clear_opt(sbi, NOHEAP);
+ f2fs_warn(sbi, "heap/no_heap options were deprecated");
break;
#ifdef CONFIG_F2FS_FS_XATTR
case Opt_user_xattr:
@@ -913,16 +882,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
- case Opt_io_size_bits:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
- f2fs_warn(sbi, "Not support %ld, larger than %d",
- BIT(arg), BIO_MAX_VECS);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).write_io_size_bits = arg;
- break;
#ifdef CONFIG_F2FS_FAULT_INJECTION
case Opt_fault_injection:
if (args->from && match_int(args, &arg))
@@ -1392,12 +1351,6 @@ default_check:
}
#endif
- if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO",
- F2FS_IO_SIZE_KB(sbi));
- return -EINVAL;
- }
-
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
int min_size, max_size;
@@ -1718,7 +1671,6 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_page_array_cache(sbi);
f2fs_destroy_xattr_caches(sbi);
- mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
@@ -2009,10 +1961,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
} else {
seq_puts(seq, ",nodiscard");
}
- if (test_opt(sbi, NOHEAP))
- seq_puts(seq, ",no_heap");
- else
- seq_puts(seq, ",heap");
#ifdef CONFIG_F2FS_FS_XATTR
if (test_opt(sbi, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -2078,9 +2026,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
F2FS_OPTION(sbi).s_resuid),
from_kgid_munged(&init_user_ns,
F2FS_OPTION(sbi).s_resgid));
- if (F2FS_IO_SIZE_BITS(sbi))
- seq_printf(seq, ",io_bits=%u",
- F2FS_OPTION(sbi).write_io_size_bits);
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (test_opt(sbi, FAULT_INJECTION)) {
seq_printf(seq, ",fault_injection=%u",
@@ -2192,7 +2137,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
- set_opt(sbi, NOHEAP);
set_opt(sbi, MERGE_CHECKPOINT);
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
@@ -2247,6 +2191,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
.init_gc_type = FG_GC,
.should_migrate_blocks = false,
.err_gc_skipped = true,
+ .no_bg_gc = true,
.nr_free_secs = 1 };
f2fs_down_write(&sbi->gc_lock);
@@ -2332,7 +2277,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
- bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
@@ -2440,12 +2384,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
- err = -EINVAL;
- f2fs_warn(sbi, "switch io_bits option is not allowed");
- goto restore_opts;
- }
-
if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) {
err = -EINVAL;
f2fs_warn(sbi, "switch compress_cache option is not allowed");
@@ -2768,7 +2706,7 @@ int f2fs_dquot_initialize(struct inode *inode)
return dquot_initialize(inode);
}
-static struct dquot **f2fs_get_dquots(struct inode *inode)
+static struct dquot __rcu **f2fs_get_dquots(struct inode *inode)
{
return F2FS_I(inode)->i_dquot;
}
@@ -3706,7 +3644,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
}
main_segs = le32_to_cpu(raw_super->segment_count_main);
- blocks_per_seg = sbi->blocks_per_seg;
+ blocks_per_seg = BLKS_PER_SEG(sbi);
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
@@ -3818,9 +3756,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
sbi->total_sections = le32_to_cpu(raw_super->section_count);
- sbi->total_node_count =
- (le32_to_cpu(raw_super->segment_count_nat) / 2)
- * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+ sbi->total_node_count = SEGS_TO_BLKS(sbi,
+ ((le32_to_cpu(raw_super->segment_count_nat) / 2) *
+ NAT_ENTRY_PER_BLOCK));
F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
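
BLKS_PER_SEG(), SEGS_PER_SEC() and SEGS_TO_BLKS() used in this and the following hunks are segment/block unit-conversion helpers added to f2fs.h earlier in this series; their assumed shape:

    /* Assumed helpers (f2fs.h): segment <-> block unit conversions */
    #define BLKS_PER_SEG(sbi)	((sbi)->blocks_per_seg)
    #define SEGS_PER_SEC(sbi)	((sbi)->segs_per_sec)
    #define SEGS_TO_BLKS(sbi, segs)	((segs) << (sbi)->log_blocks_per_seg)
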
@@ -3829,7 +3767,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
- sbi->migration_granularity = sbi->segs_per_sec;
+ sbi->migration_granularity = SEGS_PER_SEC(sbi);
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
@@ -3930,11 +3868,6 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
return 0;
zone_sectors = bdev_zone_sectors(bdev);
- if (!is_power_of_2(zone_sectors)) {
- f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n");
- return -EINVAL;
- }
-
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
SECTOR_TO_BLOCK(zone_sectors))
return -EINVAL;
@@ -4090,7 +4023,9 @@ static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi)
f2fs_up_write(&sbi->sb_lock);
if (err)
- f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_commit_super fails to record stop_reason, err:%d",
+ err);
}
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
@@ -4133,8 +4068,9 @@ static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error)
err = f2fs_commit_super(sbi, false);
if (err)
- f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d",
- error, err);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_commit_super fails to record errors:%u, err:%d",
+ error, err);
out_unlock:
f2fs_up_write(&sbi->sb_lock);
}
@@ -4259,14 +4195,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
if (i == 0) {
FDEV(i).start_blk = 0;
FDEV(i).end_blk = FDEV(i).start_blk +
- (FDEV(i).total_segments <<
- sbi->log_blocks_per_seg) - 1 +
- le32_to_cpu(raw_super->segment0_blkaddr);
+ SEGS_TO_BLKS(sbi,
+ FDEV(i).total_segments) - 1 +
+ le32_to_cpu(raw_super->segment0_blkaddr);
} else {
FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
FDEV(i).end_blk = FDEV(i).start_blk +
- (FDEV(i).total_segments <<
- sbi->log_blocks_per_seg) - 1;
+ SEGS_TO_BLKS(sbi,
+ FDEV(i).total_segments) - 1;
FDEV(i).bdev_file = bdev_file_open_by_path(
FDEV(i).path, mode, sbi->sb, NULL);
}
@@ -4305,8 +4241,6 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).total_segments,
FDEV(i).start_blk, FDEV(i).end_blk);
}
- f2fs_info(sbi,
- "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi));
return 0;
}
@@ -4519,19 +4453,10 @@ try_onemore:
if (err)
goto free_iostat;
- if (F2FS_IO_ALIGNED(sbi)) {
- sbi->write_io_dummy =
- mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
- if (!sbi->write_io_dummy) {
- err = -ENOMEM;
- goto free_percpu;
- }
- }
-
/* init per sbi slab cache */
err = f2fs_init_xattr_caches(sbi);
if (err)
- goto free_io_dummy;
+ goto free_percpu;
err = f2fs_init_page_array_cache(sbi);
if (err)
goto free_xattr_cache;
@@ -4619,10 +4544,6 @@ try_onemore:
goto free_nm;
}
- err = adjust_reserved_segment(sbi);
- if (err)
- goto free_nm;
-
/* For write statistics */
sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
@@ -4749,13 +4670,20 @@ reset_checkpoint:
* If the f2fs is not readonly and fsync data recovery succeeds,
* check zoned block devices' write pointer consistency.
*/
- if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) {
- err = f2fs_check_write_pointer(sbi);
- if (err)
- goto free_meta;
+ if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) {
+ int err2;
+
+ f2fs_notice(sbi, "Checking entire write pointers");
+ err2 = f2fs_check_write_pointer(sbi);
+ if (err2)
+ err = err2;
}
+ if (err)
+ goto free_meta;
- f2fs_init_inmem_curseg(sbi);
+ err = f2fs_init_inmem_curseg(sbi);
+ if (err)
+ goto sync_free_meta;
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -4854,8 +4782,6 @@ free_page_array_cache:
f2fs_destroy_page_array_cache(sbi);
free_xattr_cache:
f2fs_destroy_xattr_caches(sbi);
-free_io_dummy:
- mempool_destroy(sbi->write_io_dummy);
free_percpu:
destroy_percpu_info(sbi);
free_iostat:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a7ec55c7bb20..a568ce96cf56 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -493,8 +493,8 @@ out:
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
F2FS_OPTION(sbi).root_reserved_blocks -
- sbi->blocks_per_seg *
- SM_I(sbi)->additional_reserved_segments)) {
+ SEGS_TO_BLKS(sbi,
+ SM_I(sbi)->additional_reserved_segments))) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@@ -551,7 +551,7 @@ out:
}
if (!strcmp(a->attr.name, "migration_granularity")) {
- if (t == 0 || t > sbi->segs_per_sec)
+ if (t == 0 || t > SEGS_PER_SEC(sbi))
return -EINVAL;
}
@@ -1492,6 +1492,50 @@ static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i;
+
+ seq_printf(seq, "Address Layout : %5luB Block address (# of Segments)\n",
+ F2FS_BLKSIZE);
+ seq_printf(seq, " SB : %12s\n", "0/1024B");
+ seq_printf(seq, " seg0_blkaddr : 0x%010x\n", SEG0_BLKADDR(sbi));
+ seq_printf(seq, " Checkpoint : 0x%010x (%10d)\n",
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr), 2);
+ seq_printf(seq, " SIT : 0x%010x (%10d)\n",
+ SIT_I(sbi)->sit_base_addr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_sit));
+ seq_printf(seq, " NAT : 0x%010x (%10d)\n",
+ NM_I(sbi)->nat_blkaddr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_nat));
+ seq_printf(seq, " SSA : 0x%010x (%10d)\n",
+ SM_I(sbi)->ssa_blkaddr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_ssa));
+ seq_printf(seq, " Main : 0x%010x (%10d)\n",
+ SM_I(sbi)->main_blkaddr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main));
+ seq_printf(seq, " # of Sections : %12d\n",
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
+ seq_printf(seq, " Segs/Sections : %12d\n",
+ SEGS_PER_SEC(sbi));
+ seq_printf(seq, " Section size : %12d MB\n",
+ SEGS_PER_SEC(sbi) << 1);
+
+ if (!f2fs_is_multi_device(sbi))
+ return 0;
+
+ seq_puts(seq, "\nDisk Map for multi devices:\n");
+ for (i = 0; i < sbi->s_ndevs; i++)
+ seq_printf(seq, "Disk:%2d (zoned=%d): 0x%010x - 0x%010x on %s\n",
+ i, bdev_is_zoned(FDEV(i).bdev),
+ FDEV(i).start_blk, FDEV(i).end_blk,
+ FDEV(i).path);
+ return 0;
+}
+
int __init f2fs_init_sysfs(void)
{
int ret;
@@ -1573,6 +1617,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
victim_bits_seq_show, sb);
proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
discard_plist_seq_show, sb);
+ proc_create_single_data("disk_map", 0444, sbi->s_proc,
+ disk_map_seq_show, sb);
return 0;
put_feature_list_kobj:
kobject_put(&sbi->s_feature_list_kobj);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 4fc95f353a7a..f7bb0c54502c 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -258,21 +258,23 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- struct page *page;
+ struct folio *folio;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
- page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
- if (!page || !PageUptodate(page)) {
+ folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
- if (page)
- put_page(page);
+ if (!IS_ERR(folio))
+ folio_put(folio);
else if (num_ra_pages > 1)
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
- page = read_mapping_page(inode->i_mapping, index, NULL);
+ folio = read_mapping_folio(inode->i_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- return page;
+ return folio_file_page(folio, index);
}
static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index c52e63e10d35..509eea96a457 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
fid->parent_i_gen = parent->i_generation;
type = FILEID_FAT_WITH_PARENT;
*lenp = FAT_FID_SIZE_WITH_PARENT;
+ } else {
+ /*
+ * We need to initialize this field because the fh is actually
+ * 12 bytes long
+ */
+ fid->parent_i_pos_hi = 0;
}
return type;
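
For reference, the handle layout this fixes (struct fat_fid from fs/fat/fat.h, shown for illustration): the 12-byte FILEID_FAT_WITHOUT_PARENT handle spans the first three 32-bit words, so parent_i_pos_hi shares the third word with i_pos_hi and would leak uninitialized data to userspace unless zeroed:

    struct fat_fid {
    	u32 i_gen;
    	u32 i_pos_low;
    	u16 i_pos_hi;
    	u16 parent_i_pos_hi;	/* shares word 3 of the 12-byte fh */
    	u32 parent_i_pos_low;
    	u32 parent_i_gen;
    };
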
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 038ed0b9aaa5..8674dbfbe59d 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -52,3 +52,14 @@ config FUSE_DAX
If you want to allow mounting a Virtio Filesystem with the "dax"
option, answer Y.
+
+config FUSE_PASSTHROUGH
+ bool "FUSE passthrough operations support"
+ default y
+ depends on FUSE_FS
+ select FS_STACK
+ help
+ This allows bypassing the FUSE server by mapping specific FUSE
+ operations to be performed directly on a backing file.
+
+ If you want to allow passthrough operations, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 0c48b35c058d..6e0228c6d0cb 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 284a35006462..97ac994ff78f 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
if (!fc)
goto out;
- down_read(&fc->killsb);
- spin_lock(&fc->bg_lock);
- fc->congestion_threshold = val;
- spin_unlock(&fc->bg_lock);
- up_read(&fc->killsb);
+ WRITE_ONCE(fc->congestion_threshold, val);
fuse_conn_put(fc);
out:
return ret;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1a8f82f478cb..3ec8bb5e68ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1775,6 +1775,61 @@ copy_finish:
return err;
}
+/*
+ * Resend all requests on the processing queues.
+ *
+ * When a FUSE daemon panics and fails over, some inflight requests may be
+ * lost and never answered. As a result, applications awaiting replies would
+ * become stuck forever. To address this, the daemon can use this
+ * notification to trigger resending of the pending requests, ensuring
+ * they are properly processed again.
+ *
+ * Please note that this strategy is applicable only to idempotent requests or
+ * if the FUSE daemon takes careful measures to avoid processing duplicated
+ * non-idempotent requests.
+ */
+static void fuse_resend(struct fuse_conn *fc)
+{
+ struct fuse_dev *fud;
+ struct fuse_req *req, *next;
+ struct fuse_iqueue *fiq = &fc->iq;
+ LIST_HEAD(to_queue);
+ unsigned int i;
+
+ spin_lock(&fc->lock);
+ if (!fc->connected) {
+ spin_unlock(&fc->lock);
+ return;
+ }
+
+ list_for_each_entry(fud, &fc->devices, entry) {
+ struct fuse_pqueue *fpq = &fud->pq;
+
+ spin_lock(&fpq->lock);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_tail_init(&fpq->processing[i], &to_queue);
+ spin_unlock(&fpq->lock);
+ }
+ spin_unlock(&fc->lock);
+
+ list_for_each_entry_safe(req, next, &to_queue, list) {
+ __set_bit(FR_PENDING, &req->flags);
+ /* mark the request as a resend request */
+ req->in.h.unique |= FUSE_UNIQUE_RESEND;
+ }
+
+ spin_lock(&fiq->lock);
+ /* iq and pq requests are both oldest to newest */
+ list_splice(&to_queue, &fiq->pending);
+ fiq->ops->wake_pending_and_unlock(fiq);
+}
+
+static int fuse_notify_resend(struct fuse_conn *fc)
+{
+ fuse_resend(fc);
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
@@ -1800,6 +1855,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_DELETE:
return fuse_notify_delete(fc, size, cs);
+ case FUSE_NOTIFY_RESEND:
+ return fuse_notify_resend(fc);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
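
A failover-capable daemon triggers the resend path by writing an unsolicited notification header (unique == 0, with the notify code in the error field) to /dev/fuse; FUSE_NOTIFY_RESEND carries no payload, so the header alone suffices. A minimal userspace sketch, illustrative rather than the canonical libfuse API:

    #include <string.h>
    #include <unistd.h>
    #include <linux/fuse.h>

    /* Ask the kernel to re-queue all inflight requests (sketch). */
    static int trigger_resend(int fuse_dev_fd)
    {
    	struct fuse_out_header out;

    	memset(&out, 0, sizeof(out));
    	out.len = sizeof(out);
    	out.unique = 0;			/* zero marks a notification */
    	out.error = FUSE_NOTIFY_RESEND;	/* notify code, no payload */

    	return write(fuse_dev_fd, &out, sizeof(out)) == sizeof(out) ? 0 : -1;
    }
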
@@ -2251,43 +2309,91 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
return 0;
}
-static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
{
int res;
int oldfd;
struct fuse_dev *fud = NULL;
struct fd f;
+ if (get_user(oldfd, argp))
+ return -EFAULT;
+
+ f = fdget(oldfd);
+ if (!f.file)
+ return -EINVAL;
+
+ /*
+ * Check against file->f_op because CUSE
+ * uses the same ioctl handler.
+ */
+ if (f.file->f_op == file->f_op)
+ fud = fuse_get_dev(f.file);
+
+ res = -EINVAL;
+ if (fud) {
+ mutex_lock(&fuse_mutex);
+ res = fuse_device_clone(fud->fc, file);
+ mutex_unlock(&fuse_mutex);
+ }
+
+ fdput(f);
+ return res;
+}
+
+static long fuse_dev_ioctl_backing_open(struct file *file,
+ struct fuse_backing_map __user *argp)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_backing_map map;
+
+ if (!fud)
+ return -EPERM;
+
+ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&map, argp, sizeof(map)))
+ return -EFAULT;
+
+ return fuse_backing_open(fud->fc, &map);
+}
+
+static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+ int backing_id;
+
+ if (!fud)
+ return -EPERM;
+
+ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ return -EOPNOTSUPP;
+
+ if (get_user(backing_id, argp))
+ return -EFAULT;
+
+ return fuse_backing_close(fud->fc, backing_id);
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+
switch (cmd) {
case FUSE_DEV_IOC_CLONE:
- if (get_user(oldfd, (__u32 __user *)arg))
- return -EFAULT;
+ return fuse_dev_ioctl_clone(file, argp);
- f = fdget(oldfd);
- if (!f.file)
- return -EINVAL;
+ case FUSE_DEV_IOC_BACKING_OPEN:
+ return fuse_dev_ioctl_backing_open(file, argp);
+
+ case FUSE_DEV_IOC_BACKING_CLOSE:
+ return fuse_dev_ioctl_backing_close(file, argp);
- /*
- * Check against file->f_op because CUSE
- * uses the same ioctl handler.
- */
- if (f.file->f_op == file->f_op)
- fud = fuse_get_dev(f.file);
-
- res = -EINVAL;
- if (fud) {
- mutex_lock(&fuse_mutex);
- res = fuse_device_clone(fud->fc, file);
- mutex_unlock(&fuse_mutex);
- }
- fdput(f);
- break;
default:
- res = -ENOTTY;
- break;
+ return -ENOTTY;
}
- return res;
}
const struct file_operations fuse_dev_operations = {
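
The backing ioctls are the userspace entry points for passthrough setup: the server registers a backing file once and gets back a backing id that it can reference when replying to OPEN (see the fuse_i.h sketch below). A rough sketch of the open side, with field names following the fuse_backing_map uapi struct and error handling trimmed:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fuse.h>

    /* Register a backing file on the connection; returns a backing id
     * for use in later OPEN replies, or a negative value on error. */
    static int register_backing(int fuse_dev_fd, const char *path)
    {
    	struct fuse_backing_map map = { .fd = open(path, O_RDWR) };
    	int backing_id;

    	if (map.fd < 0)
    		return -1;

    	backing_id = ioctl(fuse_dev_fd, FUSE_DEV_IOC_BACKING_OPEN, &map);
    	close(map.fd);	/* the connection keeps its own reference */
    	return backing_id;
    }
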
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d19cbf34c634..4a6df591add6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
err = -EIO;
if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
+ if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) {
+ pr_warn_once("root generation should be zero\n");
+ outarg->generation = 0;
+ }
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
&outarg->attr, ATTR_TIMEOUT(outarg),
@@ -615,7 +619,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
- struct fuse_open_out outopen;
+ struct fuse_open_out *outopenp;
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
@@ -630,7 +634,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_err;
err = -ENOMEM;
- ff = fuse_file_alloc(fm);
+ ff = fuse_file_alloc(fm, true);
if (!ff)
goto out_put_forget_req;
@@ -659,8 +663,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_numargs = 2;
args.out_args[0].size = sizeof(outentry);
args.out_args[0].value = &outentry;
- args.out_args[1].size = sizeof(outopen);
- args.out_args[1].value = &outopen;
+ /* Store outarg for fuse_finish_open() */
+ outopenp = &ff->args->open_outarg;
+ args.out_args[1].size = sizeof(*outopenp);
+ args.out_args[1].value = outopenp;
err = get_create_ext(&args, dir, entry, mode);
if (err)
@@ -676,9 +682,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
fuse_invalid_attr(&outentry.attr))
goto out_free_ff;
- ff->fh = outopen.fh;
+ ff->fh = outopenp->fh;
ff->nodeid = outentry.nodeid;
- ff->open_flags = outopen.open_flags;
+ ff->open_flags = outopenp->open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, ATTR_TIMEOUT(&outentry), 0);
if (!inode) {
@@ -692,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
d_instantiate(entry, inode);
fuse_change_entry_timeout(entry, &outentry);
fuse_dir_changed(dir);
- err = finish_open(file, entry, generic_file_open);
+ err = generic_file_open(inode, file);
+ if (!err) {
+ file->private_data = ff;
+ err = finish_open(file, entry, fuse_finish_open);
+ }
if (err) {
fi = get_fuse_inode(inode);
fuse_sync_release(fi, ff, flags);
} else {
- file->private_data = ff;
- fuse_finish_open(inode, file);
if (fm->fc->atomic_o_trunc && trunc)
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
@@ -1210,7 +1218,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file,
if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) ||
((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) ||
inode_wrong_type(inode, sx->mode)))) {
- make_bad_inode(inode);
+ fuse_make_bad(inode);
return -EIO;
}
@@ -1485,7 +1493,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
*
* 1) Local access checking ('default_permissions' mount option) based
* on file mode. This is the plain old disk filesystem permission
- * modell.
+ * model.
*
* 2) "Remote" access checking, where server is responsible for
* checking permission in each inode operation. An exception to this
@@ -1630,7 +1638,30 @@ out_err:
static int fuse_dir_open(struct inode *inode, struct file *file)
{
- return fuse_open_common(inode, file, true);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ err = generic_file_open(inode, file);
+ if (err)
+ return err;
+
+ err = fuse_do_open(fm, get_node_id(inode), file, true);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ /*
+ * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for
+ * directories for backward compatibility, though it's unlikely
+ * to be useful.
+ */
+ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
+ nonseekable_open(inode, file);
+ }
+
+ return err;
}
static int fuse_dir_release(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c007b0f0c3a7..a56e7bffd000 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/splice.h>
+#include <linux/task_io_accounting_ops.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -50,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
return fuse_simple_request(fm, &args);
}
-struct fuse_release_args {
- struct fuse_args args;
- struct fuse_release_in inarg;
- struct inode *inode;
-};
-
-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
{
struct fuse_file *ff;
@@ -65,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
return NULL;
ff->fm = fm;
- ff->release_args = kzalloc(sizeof(*ff->release_args),
- GFP_KERNEL_ACCOUNT);
- if (!ff->release_args) {
- kfree(ff);
- return NULL;
+ if (release) {
+ ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT);
+ if (!ff->args) {
+ kfree(ff);
+ return NULL;
+ }
}
INIT_LIST_HEAD(&ff->write_entry);
- mutex_init(&ff->readdir.lock);
refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -85,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
void fuse_file_free(struct fuse_file *ff)
{
- kfree(ff->release_args);
- mutex_destroy(&ff->readdir.lock);
+ kfree(ff->args);
kfree(ff);
}
@@ -105,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
kfree(ra);
}
-static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
+static void fuse_file_put(struct fuse_file *ff, bool sync)
{
if (refcount_dec_and_test(&ff->count)) {
- struct fuse_args *args = &ff->release_args->args;
+ struct fuse_release_args *ra = &ff->args->release_args;
+ struct fuse_args *args = (ra ? &ra->args : NULL);
- if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
- /* Do nothing when client does not implement 'open' */
- fuse_release_end(ff->fm, args, 0);
+ if (ra && ra->inode)
+ fuse_file_io_release(ff, ra->inode);
+
+ if (!args) {
+ /* Do nothing when server does not implement 'open' */
} else if (sync) {
fuse_simple_request(ff->fm, args);
fuse_release_end(ff->fm, args, 0);
@@ -132,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+ bool open = isdir ? !fc->no_opendir : !fc->no_open;
- ff = fuse_file_alloc(fm);
+ ff = fuse_file_alloc(fm, open);
if (!ff)
return ERR_PTR(-ENOMEM);
ff->fh = 0;
/* Default for no-open */
ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
- if (isdir ? !fc->no_opendir : !fc->no_open) {
- struct fuse_open_out outarg;
+ if (open) {
+ /* Store outarg for fuse_finish_open() */
+ struct fuse_open_out *outargp = &ff->args->open_outarg;
int err;
- err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
if (!err) {
- ff->fh = outarg.fh;
- ff->open_flags = outarg.open_flags;
-
+ ff->fh = outargp->fh;
+ ff->open_flags = outargp->open_flags;
} else if (err != -ENOSYS) {
fuse_file_free(ff);
return ERR_PTR(err);
} else {
+ /* No release needed */
+ kfree(ff->args);
+ ff->args = NULL;
if (isdir)
fc->no_opendir = 1;
else
@@ -195,40 +196,50 @@ static void fuse_link_write_file(struct file *file)
spin_unlock(&fi->lock);
}
-void fuse_finish_open(struct inode *inode, struct file *file)
+int fuse_finish_open(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ err = fuse_file_io_open(file, inode);
+ if (err)
+ return err;
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
- if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
- i_size_write(inode, 0);
- spin_unlock(&fi->lock);
- file_update_time(file);
- fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
- }
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
+
+ return 0;
}
-int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ i_size_write(inode, 0);
+ spin_unlock(&fi->lock);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
{
struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = fm->fc;
+ struct fuse_file *ff;
int err;
- bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
- fc->atomic_o_trunc &&
- fc->writeback_cache;
- bool dax_truncate = (file->f_flags & O_TRUNC) &&
- fc->atomic_o_trunc && FUSE_IS_DAX(inode);
+ bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
+ bool is_wb_truncate = is_truncate && fc->writeback_cache;
+ bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
if (fuse_is_bad(inode))
return -EIO;
@@ -250,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (is_wb_truncate || dax_truncate)
fuse_set_nowrite(inode);
- err = fuse_do_open(fm, get_node_id(inode), file, isdir);
- if (!err)
- fuse_finish_open(inode, file);
+ err = fuse_do_open(fm, get_node_id(inode), file, false);
+ if (!err) {
+ ff = file->private_data;
+ err = fuse_finish_open(inode, file);
+ if (err)
+ fuse_sync_release(fi, ff, file->f_flags);
+ else if (is_truncate)
+ fuse_truncate_update_attr(inode, file);
+ }
if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
if (!err) {
- struct fuse_file *ff = file->private_data;
-
- if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ if (is_truncate)
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
@@ -274,10 +289,13 @@ out_inode_unlock:
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
- unsigned int flags, int opcode)
+ unsigned int flags, int opcode, bool sync)
{
struct fuse_conn *fc = ff->fm->fc;
- struct fuse_release_args *ra = ff->release_args;
+ struct fuse_release_args *ra = &ff->args->release_args;
+
+ if (fuse_file_passthrough(ff))
+ fuse_passthrough_release(ff, fuse_inode_backing(fi));
/* Inode is NULL on error path of fuse_create_open() */
if (likely(fi)) {
@@ -292,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
wake_up_interruptible_all(&ff->poll_wait);
+ if (!ra)
+ return;
+
+ /* ff->args was used for open outarg */
+ memset(ff->args, 0, sizeof(*ff->args));
ra->inarg.fh = ff->fh;
ra->inarg.flags = flags;
ra->args.in_numargs = 1;
@@ -301,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
ra->args.nodeid = ff->nodeid;
ra->args.force = true;
ra->args.nocreds = true;
+
+ /*
+ * Hold the inode until release is finished.
+ * When called from fuse_sync_release() the refcount is 1 and everything
+ * is synchronous, so skipping igrab() here is fine.
+ */
+ ra->inode = sync ? NULL : igrab(&fi->inode);
}
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir)
{
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_release_args *ra = ff->release_args;
+ struct fuse_release_args *ra = &ff->args->release_args;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(fi, ff, open_flags, opcode);
+ fuse_prepare_release(fi, ff, open_flags, opcode, false);
- if (ff->flock) {
+ if (ra && ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
}
- /* Hold inode until release is finished */
- ra->inode = igrab(inode);
/*
* Normally this will send the RELEASE request, however if
@@ -328,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fm->fc->destroy, isdir);
+ fuse_file_put(ff, ff->fm->fc->destroy);
}
void fuse_release_common(struct file *file, bool isdir)
@@ -337,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir)
(fl_owner_t) file, isdir);
}
-static int fuse_open(struct inode *inode, struct file *file)
-{
- return fuse_open_common(inode, file, false);
-}
-
static int fuse_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -363,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
unsigned int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
- fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
- /*
- * iput(NULL) is a no-op and since the refcount is 1 and everything's
- * synchronous, we are fine with not doing igrab() here"
- */
- fuse_file_put(ff, true, false);
+ fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
+ fuse_file_put(ff, true);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
@@ -634,7 +653,8 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap,
for (i = 0; i < ap->num_pages; i++) {
if (should_dirty)
set_page_dirty_lock(ap->pages[i]);
- put_page(ap->pages[i]);
+ if (ap->args.is_pinned)
+ unpin_user_page(ap->pages[i]);
}
}
@@ -925,7 +945,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
put_page(page);
}
if (ia->ff)
- fuse_file_put(ia->ff, false, false);
+ fuse_file_put(ia->ff, false);
fuse_io_free(ia);
}
@@ -1299,13 +1319,93 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
return res;
}
+static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
+/*
+ * @return true if an exclusive lock for direct IO writes is needed
+ */
+static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /* The server has to advertise that it supports parallel dio writes. */
+ if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
+ return true;
+
+ /*
+ * Append will need to know the eventual EOF - always needs an
+ * exclusive lock.
+ */
+ if (iocb->ki_flags & IOCB_APPEND)
+ return true;
+
+ /* shared locks are not allowed with parallel page cache IO */
+ if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
+ return true;
+
+ /* Parallel dio beyond EOF is not supported, at least for now. */
+ if (fuse_io_past_eof(iocb, from))
+ return true;
+
+ return false;
+}
+
+static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
+ bool *exclusive)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_file *ff = iocb->ki_filp->private_data;
+
+ *exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
+ if (*exclusive) {
+ inode_lock(inode);
+ } else {
+ inode_lock_shared(inode);
+ /*
+ * New parallel dio is allowed only if the inode is not in caching
+ * mode and it denies new opens in caching mode. This check must be
+ * performed only after taking the shared inode lock; the earlier
+ * past-EOF check ran without the inode lock and might have raced,
+ * so check it again.
+ */
+ if (fuse_io_past_eof(iocb, from) ||
+ fuse_file_uncached_io_start(inode, ff, NULL) != 0) {
+ inode_unlock_shared(inode);
+ inode_lock(inode);
+ *exclusive = true;
+ }
+ }
+}
+
+static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_file *ff = iocb->ki_filp->private_data;
+
+ if (exclusive) {
+ inode_unlock(inode);
+ } else {
+ /* Allow opens in caching mode after the last parallel dio ends */
+ fuse_file_uncached_io_end(inode, ff);
+ inode_unlock_shared(inode);
+ }
+}
+
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t written = 0;
struct inode *inode = mapping->host;
- ssize_t err;
+ ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
if (fc->writeback_cache) {
@@ -1327,10 +1427,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
writethrough:
inode_lock(inode);
- err = generic_write_checks(iocb, from);
+ err = count = generic_write_checks(iocb, from);
if (err <= 0)
goto out;
+ task_io_account_write(count);
+
err = file_remove_privs(file);
if (err)
goto out;
@@ -1392,10 +1494,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
while (nbytes < *nbytesp && ap->num_pages < max_pages) {
unsigned npages;
size_t start;
- ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
- *nbytesp - nbytes,
- max_pages - ap->num_pages,
- &start);
+ struct page **pt_pages;
+
+ pt_pages = &ap->pages[ap->num_pages];
+ ret = iov_iter_extract_pages(ii, &pt_pages,
+ *nbytesp - nbytes,
+ max_pages - ap->num_pages,
+ 0, &start);
if (ret < 0)
break;
@@ -1412,6 +1517,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.is_pinned = iov_iter_extract_will_pin(ii);
ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
@@ -1558,51 +1664,17 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
return res;
}
-static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
- struct iov_iter *iter)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
-
- return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
-}
-
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct file *file = iocb->ki_filp;
- struct fuse_file *ff = file->private_data;
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
- bool exclusive_lock =
- !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
- get_fuse_conn(inode)->direct_io_allow_mmap ||
- iocb->ki_flags & IOCB_APPEND ||
- fuse_direct_write_extending_i_size(iocb, from);
-
- /*
- * Take exclusive lock if
- * - Parallel direct writes are disabled - a user space decision
- * - Parallel direct writes are enabled and i_size is being extended.
- * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP).
- * This might not be needed at all, but needs further investigation.
- */
- if (exclusive_lock)
- inode_lock(inode);
- else {
- inode_lock_shared(inode);
-
- /* A race with truncate might have come up as the decision for
- * the lock type was done without holding the lock, check again.
- */
- if (fuse_direct_write_extending_i_size(iocb, from)) {
- inode_unlock_shared(inode);
- inode_lock(inode);
- exclusive_lock = true;
- }
- }
+ bool exclusive;
+ fuse_dio_lock(iocb, from, &exclusive);
res = generic_write_checks(iocb, from);
if (res > 0) {
+ task_io_account_write(res);
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
res = fuse_direct_IO(iocb, from);
} else {
@@ -1611,10 +1683,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- if (exclusive_lock)
- inode_unlock(inode);
- else
- inode_unlock_shared(inode);
+ fuse_dio_unlock(iocb, exclusive);
return res;
}
@@ -1631,10 +1700,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (FUSE_IS_DAX(inode))
return fuse_dax_read_iter(iocb, to);
- if (!(ff->open_flags & FOPEN_DIRECT_IO))
- return fuse_cache_read_iter(iocb, to);
- else
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (ff->open_flags & FOPEN_DIRECT_IO)
return fuse_direct_read_iter(iocb, to);
+ else if (fuse_file_passthrough(ff))
+ return fuse_passthrough_read_iter(iocb, to);
+ else
+ return fuse_cache_read_iter(iocb, to);
}
static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1649,10 +1721,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (FUSE_IS_DAX(inode))
return fuse_dax_write_iter(iocb, from);
- if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (ff->open_flags & FOPEN_DIRECT_IO)
+ return fuse_direct_write_iter(iocb, from);
+ else if (fuse_file_passthrough(ff))
+ return fuse_passthrough_write_iter(iocb, from);
+ else
return fuse_cache_write_iter(iocb, from);
+}
+
+static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct fuse_file *ff = in->private_data;
+
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
else
- return fuse_direct_write_iter(iocb, from);
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
+static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fuse_file *ff = out->private_data;
+
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
+ else
+ return iter_file_splice_write(pipe, out, ppos, len, flags);
}
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
@@ -1667,7 +1767,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
__free_page(ap->pages[i]);
if (wpa->ia.ff)
- fuse_file_put(wpa->ia.ff, false, false);
+ fuse_file_put(wpa->ia.ff, false);
kfree(ap->pages);
kfree(wpa);
@@ -1909,7 +2009,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
- fuse_file_put(ff, false, false);
+ fuse_file_put(ff, false);
return err;
}
@@ -1947,26 +2047,26 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
rcu_read_unlock();
}
-static int fuse_writepage_locked(struct page *page)
+static int fuse_writepage_locked(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_writepage_args *wpa;
struct fuse_args_pages *ap;
- struct page *tmp_page;
+ struct folio *tmp_folio;
int error = -ENOMEM;
- set_page_writeback(page);
+ folio_start_writeback(folio);
wpa = fuse_writepage_args_alloc();
if (!wpa)
goto err;
ap = &wpa->ia.ap;
- tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!tmp_page)
+ tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
+ if (!tmp_folio)
goto err_free;
error = -EIO;
@@ -1975,21 +2075,21 @@ static int fuse_writepage_locked(struct page *page)
goto err_nofile;
fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
+ fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0);
- copy_highpage(tmp_page, page);
+ folio_copy(tmp_folio, folio);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->next = NULL;
ap->args.in_pages = true;
ap->num_pages = 1;
- ap->pages[0] = tmp_page;
+ ap->pages[0] = &tmp_folio->page;
ap->descs[0].offset = 0;
ap->descs[0].length = PAGE_SIZE;
ap->args.end = fuse_writepage_end;
wpa->inode = inode;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
spin_lock(&fi->lock);
tree_insert(&fi->writepages, wpa);
@@ -1997,48 +2097,20 @@ static int fuse_writepage_locked(struct page *page)
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
- end_page_writeback(page);
+ folio_end_writeback(folio);
return 0;
err_nofile:
- __free_page(tmp_page);
+ folio_put(tmp_folio);
err_free:
kfree(wpa);
err:
- mapping_set_error(page->mapping, error);
- end_page_writeback(page);
+ mapping_set_error(folio->mapping, error);
+ folio_end_writeback(folio);
return error;
}
-static int fuse_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
- int err;
-
- if (fuse_page_is_writeback(page->mapping->host, page->index)) {
- /*
- * ->writepages() should be called for sync() and friends. We
- * should only get here on direct reclaim and then we are
- * allowed to skip a page which is already in flight
- */
- WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
-
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- fc->num_background >= fc->congestion_threshold)
- return AOP_WRITEPAGE_ACTIVATE;
-
- err = fuse_writepage_locked(page);
- unlock_page(page);
-
- return err;
-}
-
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
@@ -2307,7 +2379,7 @@ static int fuse_writepages(struct address_space *mapping,
fuse_writepages_send(&data);
}
if (data.ff)
- fuse_file_put(data.ff, false, false);
+ fuse_file_put(data.ff, false);
kfree(data.orig_pages);
out:
@@ -2401,7 +2473,7 @@ static int fuse_launder_folio(struct folio *folio)
/* Serialize with pending writeback for the same page */
fuse_wait_on_page_writeback(inode, folio->index);
- err = fuse_writepage_locked(&folio->page);
+ err = fuse_writepage_locked(folio);
if (!err)
fuse_wait_on_page_writeback(inode, folio->index);
}
@@ -2462,13 +2534,30 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
+ struct inode *inode = file_inode(file);
+ int rc;
/* DAX mmap is superior to direct_io mmap */
- if (FUSE_IS_DAX(file_inode(file)))
+ if (FUSE_IS_DAX(inode))
return fuse_dax_mmap(file, vma);
+ /*
+ * If the inode is in passthrough io mode (because some file is open
+ * in passthrough mode), either mmap to the backing file or fail the
+ * mmap, since mixing cached mmap and passthrough io mode is not allowed.
+ */
+ if (fuse_file_passthrough(ff))
+ return fuse_passthrough_mmap(file, vma);
+ else if (fuse_inode_backing(get_fuse_inode(inode)))
+ return -ENODEV;
+
+ /*
+ * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
+ * as it does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
+ */
if (ff->open_flags & FOPEN_DIRECT_IO) {
- /* Can't provide the coherency needed for MAP_SHARED
+ /*
+ * Can't provide the coherency needed for MAP_SHARED
* if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
*/
if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
@@ -2476,7 +2565,19 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
invalidate_inode_pages2(file->f_mapping);
- return generic_file_mmap(file, vma);
+ if (!(vma->vm_flags & VM_MAYSHARE)) {
+ /* MAP_PRIVATE */
+ return generic_file_mmap(file, vma);
+ }
+
+ /*
+ * The first mmap of a direct_io file puts the inode into caching
+ * io mode. It also waits for parallel dio writers to go into serial mode
+ * (exclusive instead of shared lock).
+ */
+ rc = fuse_file_cached_io_start(inode, ff);
+ if (rc)
+ return rc;
}
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
@@ -2580,10 +2681,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return -ENOLCK;
}
- /* Unlock on close is handled by the flush method */
- if ((fl->c.flc_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
- return 0;
-
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fm, &args);
@@ -3213,8 +3310,8 @@ static const struct file_operations fuse_file_operations = {
.lock = fuse_file_lock,
.get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
- .splice_read = filemap_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_read = fuse_splice_read,
+ .splice_write = fuse_splice_write,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
@@ -3225,10 +3322,10 @@ static const struct file_operations fuse_file_operations = {
static const struct address_space_operations fuse_file_aops = {
.read_folio = fuse_read_folio,
.readahead = fuse_readahead,
- .writepage = fuse_writepage,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
.dirty_folio = filemap_dirty_folio,
+ .migrate_folio = filemap_migrate_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
@@ -3245,7 +3342,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
+ fi->iocachectr = 0;
init_waitqueue_head(&fi->page_waitq);
+ init_waitqueue_head(&fi->direct_io_waitq);
fi->writepages = RB_ROOT;
if (IS_ENABLED(CONFIG_FUSE_DAX))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bcbe34488862..b24084b60864 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -76,6 +76,16 @@ struct fuse_submount_lookup {
struct fuse_forget_link *forget;
};
+/** Container for data related to mapping to backing file */
+struct fuse_backing {
+ struct file *file;
+ struct cred *cred;
+
+ /** refcount */
+ refcount_t count;
+ struct rcu_head rcu;
+};
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -111,7 +121,7 @@ struct fuse_inode {
u64 attr_version;
union {
- /* Write related fields (regular file only) */
+ /* read/write io cache (regular file only) */
struct {
/* Files usable in writepage. Protected by fi->lock */
struct list_head write_files;
@@ -123,9 +133,15 @@ struct fuse_inode {
* (FUSE_NOWRITE) means more writes are blocked */
int writectr;
+ /** Number of files/maps using page cache */
+ int iocachectr;
+
/* Waitq for writepage completion */
wait_queue_head_t page_waitq;
+ /* waitq for direct-io completion */
+ wait_queue_head_t direct_io_waitq;
+
/* List of writepage requests (pending or sent) */
struct rb_root writepages;
};
@@ -173,6 +189,10 @@ struct fuse_inode {
#endif
/** Submount specific lookup tracking */
struct fuse_submount_lookup *submount_lookup;
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ /** Reference to backing file in passthrough mode */
+ struct fuse_backing *fb;
+#endif
};
/** FUSE inode state bits */
@@ -187,19 +207,21 @@ enum {
FUSE_I_BAD,
/* Has btime */
FUSE_I_BTIME,
+ /* Wants or already has page cache IO */
+ FUSE_I_CACHE_IO_MODE,
};
struct fuse_conn;
struct fuse_mount;
-struct fuse_release_args;
+union fuse_file_args;
/** FUSE specific file data */
struct fuse_file {
/** Fuse connection for this file */
struct fuse_mount *fm;
- /* Argument space reserved for release */
- struct fuse_release_args *release_args;
+ /* Argument space reserved for open/release */
+ union fuse_file_args *args;
/** Kernel file handle guaranteed to be unique */
u64 kh;
@@ -221,12 +243,6 @@ struct fuse_file {
/* Readdir related */
struct {
- /*
- * Protects below fields against (crazy) parallel readdir on
- * same open file. Uncontended in the normal case.
- */
- struct mutex lock;
-
/* Dir stream position */
loff_t pos;
@@ -244,6 +260,15 @@ struct fuse_file {
/** Wait queue head for poll */
wait_queue_head_t poll_wait;
+ /** Does file hold a fi->iocachectr refcount? */
+ enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode;
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ /** Reference to backing file in passthrough mode */
+ struct file *passthrough;
+ const struct cred *cred;
+#endif
+
/** Has flock been performed on this file? */
bool flock:1;
};
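
The new iomode field pairs with fi->iocachectr above: cached io holds positive references, parallel (uncached) dio holds negative ones, and the two are mutually exclusive per inode. A schematic of the assumed invariant; the real accounting lives in the new iomode.c:

    /* Assumed invariant (sketch, not the actual implementation):
     *   fi->iocachectr > 0   all holders are IOM_CACHED; dio writers
     *                        fall back to the exclusive inode lock
     *   fi->iocachectr < 0   all holders are IOM_UNCACHED; new cached
     *                        opens/mmaps wait on direct_io_waitq
     *   fi->iocachectr == 0  no io mode established yet
     */
    static inline bool fuse_inode_io_cached(struct fuse_inode *fi)	/* hypothetical */
    {
    	return READ_ONCE(fi->iocachectr) > 0;
    }
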
@@ -283,6 +308,7 @@ struct fuse_args {
bool page_replace:1;
bool may_block:1;
bool is_ext:1;
+ bool is_pinned:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
@@ -295,6 +321,19 @@ struct fuse_args_pages {
unsigned int num_pages;
};
+struct fuse_release_args {
+ struct fuse_args args;
+ struct fuse_release_in inarg;
+ struct inode *inode;
+};
+
+union fuse_file_args {
+ /* Used during open() */
+ struct fuse_open_out open_outarg;
+ /* Used during release() */
+ struct fuse_release_args release_args;
+};
+
#define FUSE_ARGS(args) struct fuse_args args = {}
/** The request IO state (for asynchronous processing) */
@@ -818,6 +857,12 @@ struct fuse_conn {
/* Is statx not implemented by fs? */
unsigned int no_statx:1;
+ /** Passthrough support for read/write IO */
+ unsigned int passthrough:1;
+
+ /** Maximum stack depth for passthrough backing files */
+ int max_stack_depth;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -867,6 +912,11 @@ struct fuse_conn {
/* New writepages go into this bucket */
struct fuse_sync_bucket __rcu *curr_bucket;
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ /** IDR for backing files ids */
+ struct idr backing_files_map;
+#endif
};
/*
@@ -940,7 +990,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation,
static inline void fuse_make_bad(struct inode *inode)
{
- remove_inode_hash(inode);
set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
}
@@ -1032,14 +1081,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
size_t count, int opcode);
-/**
- * Send OPEN or OPENDIR request
- */
-int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-
-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release);
void fuse_file_free(struct fuse_file *ff);
-void fuse_finish_open(struct inode *inode, struct file *file);
+int fuse_finish_open(struct inode *inode, struct file *file);
void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
unsigned int flags);
@@ -1349,11 +1393,82 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int fuse_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
-/* file.c */
+/* iomode.c */
+int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff);
+int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb);
+void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff);
+
+int fuse_file_io_open(struct file *file, struct inode *inode);
+void fuse_file_io_release(struct fuse_file *ff, struct inode *inode);
+/* file.c */
struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, bool isdir);
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir);
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return READ_ONCE(fi->fb);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+ struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return xchg(&fi->fb, fb);
+#else
+ return NULL;
+#endif
+}
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
+void fuse_backing_put(struct fuse_backing *fb);
+#else
+
+static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+ return NULL;
+}
+
+static inline void fuse_backing_put(struct fuse_backing *fb)
+{
+}
+#endif
+
+void fuse_backing_files_init(struct fuse_conn *fc);
+void fuse_backing_files_free(struct fuse_conn *fc);
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
+int fuse_backing_close(struct fuse_conn *fc, int backing_id);
+
+struct fuse_backing *fuse_passthrough_open(struct file *file,
+ struct inode *inode,
+ int backing_id);
+void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);
+
+static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return ff->passthrough;
+#else
+ return NULL;
+#endif
+}
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
+ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags);
+ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma);
+
#endif /* _FS_FUSE_I_H */
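
The header above relies on a common kernel idiom: every CONFIG_FUSE_PASSTHROUGH accessor has a static-inline stub for the =n case, so call sites compile unconditionally and the optimizer removes the dead branches. A minimal sketch of the same idiom, using a hypothetical "frob" feature rather than the fuse names:

/* Illustrative only; "frob" and CONFIG_FROB are hypothetical. */
#ifdef CONFIG_FROB
struct frob *frob_get(struct inode *inode);	/* real version lives in frob.c */
#else
static inline struct frob *frob_get(struct inode *inode)
{
	return NULL;	/* stub: callers need no #ifdef of their own */
}
#endif

static void frob_report(struct inode *inode)
{
	/* Identical source either way; the NULL branch folds away. */
	if (frob_get(inode))
		pr_debug("frob is active\n");
}
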
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 516ea2979a90..3a5d88878335 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
goto out_free_forget;
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_inode_backing_set(fi, NULL);
+
return &fi->inode;
out_free_forget:
@@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode)
#ifdef CONFIG_FUSE_DAX
kfree(fi->dax);
#endif
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_put(fuse_inode_backing(fi));
+
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -469,8 +475,11 @@ retry:
} else if (fuse_stale_inode(inode, generation, attr)) {
/* nodeid was reused, any I/O on the old inode should fail */
fuse_make_bad(inode);
- iput(inode);
- goto retry;
+ if (inode != d_inode(sb->s_root)) {
+ remove_inode_hash(inode);
+ iput(inode);
+ goto retry;
+ }
}
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
@@ -924,6 +933,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_files_init(fc);
+
INIT_LIST_HEAD(&fc->mounts);
list_add(&fm->fc_entry, &fc->mounts);
fm->fc = fc;
@@ -954,6 +966,8 @@ void fuse_conn_put(struct fuse_conn *fc)
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_files_free(fc);
call_rcu(&fc->rcu, delayed_release);
}
}
@@ -974,7 +988,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
attr.mode = mode;
attr.ino = FUSE_ROOT_ID;
attr.nlink = 1;
- return fuse_iget(sb, 1, 0, &attr, 0, 0);
+ return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0);
}
struct fuse_inode_handle {
@@ -1117,6 +1131,11 @@ static struct dentry *fuse_get_parent(struct dentry *child)
return parent;
}
+/* only for fid encoding; no support for file handle */
+static const struct export_operations fuse_export_fid_operations = {
+ .encode_fh = fuse_encode_fh,
+};
+
static const struct export_operations fuse_export_operations = {
.fh_to_dentry = fuse_fh_to_dentry,
.fh_to_parent = fuse_fh_to_parent,
@@ -1291,6 +1310,26 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->create_supp_group = 1;
if (flags & FUSE_DIRECT_IO_ALLOW_MMAP)
fc->direct_io_allow_mmap = 1;
+ /*
+	 * max_stack_depth is the maximum stack depth of the FUSE fs,
+	 * so it has to be at least 1 to support passthrough to
+	 * backing files.
+	 *
+	 * With max_stack_depth > 1, the backing files can themselves
+	 * be on a stacked fs (e.g. overlayfs); with
+	 * max_stack_depth == 1, the FUSE fs can be stacked as the
+	 * underlying fs of a stacked fs (e.g. overlayfs).
+ */
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) &&
+ (flags & FUSE_PASSTHROUGH) &&
+ arg->max_stack_depth > 0 &&
+ arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) {
+ fc->passthrough = 1;
+ fc->max_stack_depth = arg->max_stack_depth;
+ fm->sb->s_stack_depth = arg->max_stack_depth;
+ }
+ if (flags & FUSE_NO_EXPORT_SUPPORT)
+ fm->sb->s_export_op = &fuse_export_fid_operations;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1337,7 +1376,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
- FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
+ FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
+ FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1346,6 +1386,8 @@ void fuse_send_init(struct fuse_mount *fm)
#endif
if (fm->fc->auto_submounts)
flags |= FUSE_SUBMOUNTS;
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ flags |= FUSE_PASSTHROUGH;
ia->in.flags = flags;
ia->in.flags2 = flags >> 32;
@@ -1496,8 +1538,8 @@ static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
.ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
- .uid = fi->inode.i_uid.val,
- .gid = fi->inode.i_gid.val,
+ .uid = __kuid_val(fi->inode.i_uid),
+ .gid = __kgid_val(fi->inode.i_gid),
.rdev = fi->inode.i_rdev,
.blksize = 1u << fi->inode.i_blkbits,
};
@@ -1534,6 +1576,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
sb->s_bdi = bdi_get(parent_sb->s_bdi);
sb->s_xattr = parent_sb->s_xattr;
+ sb->s_export_op = parent_sb->s_export_op;
sb->s_time_gran = parent_sb->s_time_gran;
sb->s_blocksize = parent_sb->s_blocksize;
sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
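
For the negotiation in process_init_reply() above to enable passthrough, the server must echo FUSE_PASSTHROUGH and reply with a max_stack_depth between 1 and FILESYSTEM_MAX_STACK_DEPTH. A hypothetical server-side sketch of that INIT handshake; the flags2 and max_stack_depth field names follow the fuse uapi header and should be verified against the headers actually in use:

/* Hypothetical FUSE server INIT handling (uapi field layout assumed). */
#include <stdint.h>
#include <linux/fuse.h>

static void fill_init_reply(const struct fuse_init_in *in,
			    struct fuse_init_out *out)
{
	uint64_t flags = ((uint64_t)in->flags2 << 32) | in->flags;

	if (flags & FUSE_PASSTHROUGH) {
		out->flags2 |= (uint32_t)(FUSE_PASSTHROUGH >> 32);
		out->max_stack_depth = 1;	/* backing files on non-stacked fs only */
	}
}
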
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
new file mode 100644
index 000000000000..c653ddcf0578
--- /dev/null
+++ b/fs/fuse/iomode.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE inode io modes.
+ *
+ * Copyright (c) 2024 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+
+/*
+ * Return true if a new open in caching mode needs to wait.
+ */
+static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi)
+{
+ return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi);
+}
+
+/*
+ * Start cached io mode.
+ *
+ * Blocks new parallel dio writes and waits for the in-progress parallel dio
+ * writes to complete.
+ */
+int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /* There are no io modes if server does not implement open */
+ if (!ff->args)
+ return 0;
+
+ spin_lock(&fi->lock);
+ /*
+ * Setting the bit advises new direct-io writes to use an exclusive
+ * lock - without it the wait below might be forever.
+ */
+ while (fuse_is_io_cache_wait(fi)) {
+ set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ spin_unlock(&fi->lock);
+ wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi));
+ spin_lock(&fi->lock);
+ }
+
+ /*
+ * Check if inode entered passthrough io mode while waiting for parallel
+ * dio write completion.
+ */
+ if (fuse_inode_backing(fi)) {
+ clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ spin_unlock(&fi->lock);
+ return -ETXTBSY;
+ }
+
+ WARN_ON(ff->iomode == IOM_UNCACHED);
+ if (ff->iomode == IOM_NONE) {
+ ff->iomode = IOM_CACHED;
+ if (fi->iocachectr == 0)
+ set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ fi->iocachectr++;
+ }
+ spin_unlock(&fi->lock);
+ return 0;
+}
+
+static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ WARN_ON(fi->iocachectr <= 0);
+ WARN_ON(ff->iomode != IOM_CACHED);
+ ff->iomode = IOM_NONE;
+ fi->iocachectr--;
+ if (fi->iocachectr == 0)
+ clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ spin_unlock(&fi->lock);
+}
+
+/* Start strictly uncached io mode where cache access is not allowed */
+int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_backing *oldfb;
+ int err = 0;
+
+ spin_lock(&fi->lock);
+ /* deny conflicting backing files on same fuse inode */
+ oldfb = fuse_inode_backing(fi);
+ if (oldfb && oldfb != fb) {
+ err = -EBUSY;
+ goto unlock;
+ }
+ if (fi->iocachectr > 0) {
+ err = -ETXTBSY;
+ goto unlock;
+ }
+ WARN_ON(ff->iomode != IOM_NONE);
+ fi->iocachectr--;
+ ff->iomode = IOM_UNCACHED;
+
+ /* fuse inode holds a single refcount of backing file */
+ if (!oldfb) {
+ oldfb = fuse_inode_backing_set(fi, fb);
+ WARN_ON_ONCE(oldfb != NULL);
+ } else {
+ fuse_backing_put(fb);
+ }
+unlock:
+ spin_unlock(&fi->lock);
+ return err;
+}
+
+void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_backing *oldfb = NULL;
+
+ spin_lock(&fi->lock);
+ WARN_ON(fi->iocachectr >= 0);
+ WARN_ON(ff->iomode != IOM_UNCACHED);
+ ff->iomode = IOM_NONE;
+ fi->iocachectr++;
+ if (!fi->iocachectr) {
+ wake_up(&fi->direct_io_waitq);
+ oldfb = fuse_inode_backing_set(fi, NULL);
+ }
+ spin_unlock(&fi->lock);
+ if (oldfb)
+ fuse_backing_put(oldfb);
+}
+
+/*
+ * Open flags that are allowed in combination with FOPEN_PASSTHROUGH.
+ * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write
+ * operations go directly to the server, but mmap is done on the backing file.
+ * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode
+ * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination.
+ */
+#define FOPEN_PASSTHROUGH_MASK \
+ (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \
+ FOPEN_NOFLUSH)
+
+static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_backing *fb;
+ int err;
+
+ /* Check allowed conditions for file open in passthrough mode */
+ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough ||
+ (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
+ return -EINVAL;
+
+ fb = fuse_passthrough_open(file, inode,
+ ff->args->open_outarg.backing_id);
+ if (IS_ERR(fb))
+ return PTR_ERR(fb);
+
+ /* First passthrough file open denies caching inode io mode */
+ err = fuse_file_uncached_io_start(inode, ff, fb);
+ if (!err)
+ return 0;
+
+ fuse_passthrough_release(ff, fb);
+ fuse_backing_put(fb);
+
+ return err;
+}
+
+/* Request access to submit new io to inode via open file */
+int fuse_file_io_open(struct file *file, struct inode *inode)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int err;
+
+ /*
+ * io modes are not relevant with DAX and with server that does not
+ * implement open.
+ */
+ if (FUSE_IS_DAX(inode) || !ff->args)
+ return 0;
+
+ /*
+ * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode
+ * which is already open for passthrough.
+ */
+ err = -EINVAL;
+ if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH))
+ goto fail;
+
+ /*
+ * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO.
+ */
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES;
+
+ /*
+ * First passthrough file open denies caching inode io mode.
+ * First caching file open enters caching inode io mode.
+ *
+	 * Note that if the user opens a file with O_DIRECT, but the server
+	 * did not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT,
+ * so we put the inode in caching mode to prevent parallel dio.
+ */
+ if ((ff->open_flags & FOPEN_DIRECT_IO) &&
+ !(ff->open_flags & FOPEN_PASSTHROUGH))
+ return 0;
+
+ if (ff->open_flags & FOPEN_PASSTHROUGH)
+ err = fuse_file_passthrough_open(inode, file);
+ else
+ err = fuse_file_cached_io_start(inode, ff);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n",
+ ff->open_flags, err);
+ /*
+ * The file open mode determines the inode io mode.
+ * Using incorrect open mode is a server mistake, which results in
+ * user visible failure of open() with EIO error.
+ */
+ return -EIO;
+}
+
+/* No more pending io and no new io possible to inode via open/mmapped file */
+void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
+{
+ /*
+ * Last parallel dio close allows caching inode io mode.
+ * Last caching file close exits caching inode io mode.
+ */
+ switch (ff->iomode) {
+ case IOM_NONE:
+ /* Nothing to do */
+ break;
+ case IOM_UNCACHED:
+ fuse_file_uncached_io_end(inode, ff);
+ break;
+ case IOM_CACHED:
+ fuse_file_cached_io_end(inode, ff);
+ break;
+ }
+}
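
A note on the counter at the heart of this file: fi->iocachectr is positive while files are open in cached mode, negative while files are open in uncached (passthrough or parallel-dio) mode, and zero when neither constraint applies; the two signs are mutually exclusive. A toy model of those transitions, assuming only the invariants visible above:

/* Toy model of fi->iocachectr (illustrative, not kernel code). */
struct mode_ctr {
	int ctr;	/* > 0: cached opens, < 0: uncached opens */
};

static int enter_cached(struct mode_ctr *m)
{
	if (m->ctr < 0)
		return -1;	/* uncached users present: wait or fail */
	m->ctr++;
	return 0;
}

static int enter_uncached(struct mode_ctr *m)
{
	if (m->ctr > 0)
		return -1;	/* maps to -ETXTBSY above */
	m->ctr--;
	return 0;
}

static void leave(struct mode_ctr *m)
{
	/* Cached opens decrement, uncached opens increment, toward 0. */
	m->ctr += (m->ctr > 0) ? -1 : 1;
}
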
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
new file mode 100644
index 000000000000..1567f0323858
--- /dev/null
+++ b/fs/fuse/passthrough.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+#include <linux/backing-file.h>
+#include <linux/splice.h>
+
+static void fuse_file_accessed(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ fuse_invalidate_atime(inode);
+}
+
+static void fuse_file_modified(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+}
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct file *backing_file = fuse_file_passthrough(ff);
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ff->cred,
+ .user_file = file,
+ .accessed = fuse_file_accessed,
+ };
+
+ pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
+ backing_file, iocb->ki_pos, count);
+
+ if (!count)
+ return 0;
+
+ ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags,
+ &ctx);
+
+ return ret;
+}
+
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct fuse_file *ff = file->private_data;
+ struct file *backing_file = fuse_file_passthrough(ff);
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ff->cred,
+ .user_file = file,
+ .end_write = fuse_file_modified,
+ };
+
+ pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
+ backing_file, iocb->ki_pos, count);
+
+ if (!count)
+ return 0;
+
+ inode_lock(inode);
+ ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags,
+ &ctx);
+ inode_unlock(inode);
+
+ return ret;
+}
+
+ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct fuse_file *ff = in->private_data;
+ struct file *backing_file = fuse_file_passthrough(ff);
+ struct backing_file_ctx ctx = {
+ .cred = ff->cred,
+ .user_file = in,
+ .accessed = fuse_file_accessed,
+ };
+
+ pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
+ backing_file, ppos ? *ppos : 0, len, flags);
+
+ return backing_file_splice_read(backing_file, ppos, pipe, len, flags,
+ &ctx);
+}
+
+ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+ struct fuse_file *ff = out->private_data;
+ struct file *backing_file = fuse_file_passthrough(ff);
+ struct inode *inode = file_inode(out);
+ ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ff->cred,
+ .user_file = out,
+ .end_write = fuse_file_modified,
+ };
+
+ pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
+ backing_file, ppos ? *ppos : 0, len, flags);
+
+ inode_lock(inode);
+ ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags,
+ &ctx);
+ inode_unlock(inode);
+
+ return ret;
+}
+
+ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct fuse_file *ff = file->private_data;
+ struct file *backing_file = fuse_file_passthrough(ff);
+ struct backing_file_ctx ctx = {
+ .cred = ff->cred,
+ .user_file = file,
+ .accessed = fuse_file_accessed,
+ };
+
+ pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__,
+ backing_file, vma->vm_start, vma->vm_end);
+
+ return backing_file_mmap(backing_file, vma, &ctx);
+}
+
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+ if (fb && refcount_inc_not_zero(&fb->count))
+ return fb;
+ return NULL;
+}
+
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+ pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+ if (fb->file)
+ fput(fb->file);
+ put_cred(fb->cred);
+ kfree_rcu(fb, rcu);
+}
+
+void fuse_backing_put(struct fuse_backing *fb)
+{
+ if (fb && refcount_dec_and_test(&fb->count))
+ fuse_backing_free(fb);
+}
+
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+ idr_init(&fc->backing_files_map);
+}
+
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&fc->lock);
+ /* FIXME: xarray might be space inefficient */
+ id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+ spin_unlock(&fc->lock);
+ idr_preload_end();
+
+ WARN_ON_ONCE(id == 0);
+ return id;
+}
+
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+ int id)
+{
+ struct fuse_backing *fb;
+
+ spin_lock(&fc->lock);
+ fb = idr_remove(&fc->backing_files_map, id);
+ spin_unlock(&fc->lock);
+
+ return fb;
+}
+
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+ struct fuse_backing *fb = p;
+
+ WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+ fuse_backing_free(fb);
+ return 0;
+}
+
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+ idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+ idr_destroy(&fc->backing_files_map);
+}
+
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+ struct file *file;
+ struct super_block *backing_sb;
+ struct fuse_backing *fb = NULL;
+ int res;
+
+ pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ res = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ res = -EINVAL;
+ if (map->flags)
+ goto out;
+
+ file = fget(map->fd);
+ res = -EBADF;
+ if (!file)
+ goto out;
+
+ res = -EOPNOTSUPP;
+ if (!file->f_op->read_iter || !file->f_op->write_iter)
+ goto out_fput;
+
+ backing_sb = file_inode(file)->i_sb;
+ res = -ELOOP;
+ if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+ goto out_fput;
+
+ fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+ res = -ENOMEM;
+ if (!fb)
+ goto out_fput;
+
+ fb->file = file;
+ fb->cred = prepare_creds();
+ refcount_set(&fb->count, 1);
+
+ res = fuse_backing_id_alloc(fc, fb);
+ if (res < 0) {
+ fuse_backing_free(fb);
+ fb = NULL;
+ }
+
+out:
+ pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+ return res;
+
+out_fput:
+ fput(file);
+ goto out;
+}
+
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb = NULL;
+ int err;
+
+ pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ err = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ err = -EINVAL;
+ if (backing_id <= 0)
+ goto out;
+
+ err = -ENOENT;
+ fb = fuse_backing_id_remove(fc, backing_id);
+ if (!fb)
+ goto out;
+
+ fuse_backing_put(fb);
+ err = 0;
+out:
+ pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+ return err;
+}
+
+/*
+ * Setup passthrough to a backing file.
+ *
+ * Returns an fb object with elevated refcount to be stored in fuse inode.
+ */
+struct fuse_backing *fuse_passthrough_open(struct file *file,
+ struct inode *inode,
+ int backing_id)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fm->fc;
+ struct fuse_backing *fb = NULL;
+ struct file *backing_file;
+ int err;
+
+ err = -EINVAL;
+ if (backing_id <= 0)
+ goto out;
+
+ rcu_read_lock();
+ fb = idr_find(&fc->backing_files_map, backing_id);
+ fb = fuse_backing_get(fb);
+ rcu_read_unlock();
+
+ err = -ENOENT;
+ if (!fb)
+ goto out;
+
+ /* Allocate backing file per fuse file to store fuse path */
+ backing_file = backing_file_open(&file->f_path, file->f_flags,
+ &fb->file->f_path, fb->cred);
+ err = PTR_ERR(backing_file);
+ if (IS_ERR(backing_file)) {
+ fuse_backing_put(fb);
+ goto out;
+ }
+
+ err = 0;
+ ff->passthrough = backing_file;
+ ff->cred = get_cred(fb->cred);
+out:
+ pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__,
+ backing_id, fb, ff->passthrough, err);
+
+ return err ? ERR_PTR(err) : fb;
+}
+
+void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb)
+{
+ pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__,
+ fb, ff->passthrough);
+
+ fput(ff->passthrough);
+ ff->passthrough = NULL;
+ put_cred(ff->cred);
+ ff->cred = NULL;
+}
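
A server uses this file in two steps: register a backing file on the fuse device fd to obtain a backing_id, then answer OPEN with FOPEN_PASSTHROUGH and that id. A hypothetical userspace sketch; the FUSE_DEV_IOC_BACKING_OPEN ioctl, struct fuse_backing_map, and the backing_id field of fuse_open_out follow the uapi header and should be checked against the kernel headers in use:

/* Hypothetical FUSE server side of passthrough setup. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fuse.h>

static int open_with_passthrough(int fuse_dev_fd, const char *backing_path,
				 struct fuse_open_out *open_out)
{
	struct fuse_backing_map map = { .fd = -1 };	/* flags must be 0 */
	int backing_id;

	map.fd = open(backing_path, O_RDWR);
	if (map.fd < 0)
		return -1;

	/* Register the backing file; the kernel returns a backing_id. */
	backing_id = ioctl(fuse_dev_fd, FUSE_DEV_IOC_BACKING_OPEN, &map);
	close(map.fd);		/* the kernel took its own reference */
	if (backing_id < 0)
		return -1;

	open_out->open_flags |= FOPEN_PASSTHROUGH;
	open_out->backing_id = backing_id;
	return 0;
}

The matching FUSE_DEV_IOC_BACKING_CLOSE call drops the registration once no open file needs it; fuse_backing_close() above is the kernel side of that.
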
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index c66a54d6c7d3..0377b6dc24c8 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -592,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx)
if (fuse_is_bad(inode))
return -EIO;
- mutex_lock(&ff->readdir.lock);
-
err = UNCACHED;
if (ff->open_flags & FOPEN_CACHE_DIR)
err = fuse_readdir_cached(file, ctx);
if (err == UNCACHED)
err = fuse_readdir_uncached(file, ctx);
- mutex_unlock(&ff->readdir.lock);
-
return err;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 5f1be1da92ce..322af827a232 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -16,6 +16,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/cleanup.h>
#include <linux/uio.h>
#include "fuse_i.h"
@@ -31,6 +32,9 @@
static DEFINE_MUTEX(virtio_fs_mutex);
static LIST_HEAD(virtio_fs_instances);
+/* The /sys/fs/virtio_fs/ kset */
+static struct kset *virtio_fs_kset;
+
enum {
VQ_HIPRIO,
VQ_REQUEST
@@ -55,7 +59,7 @@ struct virtio_fs_vq {
/* A virtio-fs device instance */
struct virtio_fs {
- struct kref refcount;
+ struct kobject kobj;
struct list_head list; /* on virtio_fs_instances */
char *tag;
struct virtio_fs_vq *vqs;
@@ -161,18 +165,40 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
complete(&fsvq->in_flight_zero);
}
-static void release_virtio_fs_obj(struct kref *ref)
+static ssize_t tag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
+
+	return sysfs_emit(buf, "%s\n", fs->tag);
+}
+
+static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag);
+
+static struct attribute *virtio_fs_attrs[] = {
+ &virtio_fs_tag_attr.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(virtio_fs);
+
+static void virtio_fs_ktype_release(struct kobject *kobj)
{
- struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+ struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
kfree(vfs->vqs);
kfree(vfs);
}
+static const struct kobj_type virtio_fs_ktype = {
+ .release = virtio_fs_ktype_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = virtio_fs_groups,
+};
+
/* Make sure virtiofs_mutex is held */
static void virtio_fs_put(struct virtio_fs *fs)
{
- kref_put(&fs->refcount, release_virtio_fs_obj);
+ kobject_put(&fs->kobj);
}
static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
@@ -243,25 +269,46 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs)
}
/* Add a new instance to the list or return -EEXIST if tag name exists */
-static int virtio_fs_add_instance(struct virtio_fs *fs)
+static int virtio_fs_add_instance(struct virtio_device *vdev,
+ struct virtio_fs *fs)
{
struct virtio_fs *fs2;
- bool duplicate = false;
+ int ret;
mutex_lock(&virtio_fs_mutex);
list_for_each_entry(fs2, &virtio_fs_instances, list) {
- if (strcmp(fs->tag, fs2->tag) == 0)
- duplicate = true;
+ if (strcmp(fs->tag, fs2->tag) == 0) {
+ mutex_unlock(&virtio_fs_mutex);
+ return -EEXIST;
+ }
}
- if (!duplicate)
- list_add_tail(&fs->list, &virtio_fs_instances);
+	/* Use the virtio_device's index as a unique identifier; there is no
+ * need to allocate our own identifiers because the virtio_fs instance
+ * is only visible to userspace as long as the underlying virtio_device
+ * exists.
+ */
+ fs->kobj.kset = virtio_fs_kset;
+ ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index);
+ if (ret < 0) {
+ mutex_unlock(&virtio_fs_mutex);
+ return ret;
+ }
+
+ ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device");
+ if (ret < 0) {
+ kobject_del(&fs->kobj);
+ mutex_unlock(&virtio_fs_mutex);
+ return ret;
+ }
+
+ list_add_tail(&fs->list, &virtio_fs_instances);
mutex_unlock(&virtio_fs_mutex);
- if (duplicate)
- return -EEXIST;
+ kobject_uevent(&fs->kobj, KOBJ_ADD);
+
return 0;
}
@@ -274,7 +321,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag)
list_for_each_entry(fs, &virtio_fs_instances, list) {
if (strcmp(fs->tag, tag) == 0) {
- kref_get(&fs->refcount);
+ kobject_get(&fs->kobj);
goto found;
}
}
@@ -323,6 +370,16 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
return -ENOMEM;
memcpy(fs->tag, tag_buf, len);
fs->tag[len] = '\0';
+
+ /* While the VIRTIO specification allows any character, newlines are
+ * awkward on mount(8) command-lines and cause problems in the sysfs
+ * "tag" attr and uevent TAG= properties. Forbid them.
+ */
+ if (strchr(fs->tag, '\n')) {
+ dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -345,7 +402,7 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
kfree(req);
dec_in_flight_req(fsvq);
}
- } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ } while (!virtqueue_enable_cb(vq));
spin_unlock(&fsvq->lock);
}
@@ -627,7 +684,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
list_move_tail(&req->list, &reqs);
spin_unlock(&fpq->lock);
}
- } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ } while (!virtqueue_enable_cb(vq));
spin_unlock(&fsvq->lock);
/* End requests */
@@ -795,8 +852,11 @@ static void virtio_fs_cleanup_dax(void *data)
put_dax(dax_dev);
}
+DEFINE_FREE(cleanup_dax, struct dax_device *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T))
+
static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
{
+ struct dax_device *dax_dev __free(cleanup_dax) = NULL;
struct virtio_shm_region cache_reg;
struct dev_pagemap *pgmap;
bool have_cache;
@@ -804,6 +864,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
if (!IS_ENABLED(CONFIG_FUSE_DAX))
return 0;
+ dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
+ if (IS_ERR(dax_dev)) {
+ int rc = PTR_ERR(dax_dev);
+ return rc == -EOPNOTSUPP ? 0 : rc;
+ }
+
/* Get cache region */
have_cache = virtio_get_shm_region(vdev, &cache_reg,
(u8)VIRTIO_FS_SHMCAP_ID_CACHE);
@@ -849,10 +915,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
- fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
- if (IS_ERR(fs->dax_dev))
- return PTR_ERR(fs->dax_dev);
-
+ fs->dax_dev = no_free_ptr(dax_dev);
return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
fs->dax_dev);
}
@@ -865,7 +928,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
fs = kzalloc(sizeof(*fs), GFP_KERNEL);
if (!fs)
return -ENOMEM;
- kref_init(&fs->refcount);
+ kobject_init(&fs->kobj, &virtio_fs_ktype);
vdev->priv = fs;
ret = virtio_fs_read_tag(vdev, fs);
@@ -887,7 +950,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
*/
virtio_device_ready(vdev);
- ret = virtio_fs_add_instance(fs);
+ ret = virtio_fs_add_instance(vdev, fs);
if (ret < 0)
goto out_vqs;
@@ -896,11 +959,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
virtio_reset_device(vdev);
virtio_fs_cleanup_vqs(vdev);
- kfree(fs->vqs);
out:
vdev->priv = NULL;
- kfree(fs);
+ kobject_put(&fs->kobj);
return ret;
}
@@ -924,6 +986,8 @@ static void virtio_fs_remove(struct virtio_device *vdev)
mutex_lock(&virtio_fs_mutex);
/* This device is going away. No one should get new reference */
list_del_init(&fs->list);
+ sysfs_remove_link(&fs->kobj, "device");
+ kobject_del(&fs->kobj);
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
virtio_reset_device(vdev);
@@ -1510,21 +1574,56 @@ static struct file_system_type virtio_fs_type = {
.kill_sb = virtio_kill_sb,
};
+static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+ const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
+
+ add_uevent_var(env, "TAG=%s", fs->tag);
+ return 0;
+}
+
+static const struct kset_uevent_ops virtio_fs_uevent_ops = {
+ .uevent = virtio_fs_uevent,
+};
+
+static int __init virtio_fs_sysfs_init(void)
+{
+ virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops,
+ fs_kobj);
+ if (!virtio_fs_kset)
+ return -ENOMEM;
+ return 0;
+}
+
+static void virtio_fs_sysfs_exit(void)
+{
+ kset_unregister(virtio_fs_kset);
+ virtio_fs_kset = NULL;
+}
+
static int __init virtio_fs_init(void)
{
int ret;
- ret = register_virtio_driver(&virtio_fs_driver);
+ ret = virtio_fs_sysfs_init();
if (ret < 0)
return ret;
+ ret = register_virtio_driver(&virtio_fs_driver);
+ if (ret < 0)
+ goto sysfs_exit;
+
ret = register_filesystem(&virtio_fs_type);
- if (ret < 0) {
- unregister_virtio_driver(&virtio_fs_driver);
- return ret;
- }
+ if (ret < 0)
+ goto unregister_virtio_driver;
return 0;
+
+unregister_virtio_driver:
+ unregister_virtio_driver(&virtio_fs_driver);
+sysfs_exit:
+ virtio_fs_sysfs_exit();
+ return ret;
}
module_init(virtio_fs_init);
@@ -1532,6 +1631,7 @@ static void __exit virtio_fs_exit(void)
{
unregister_filesystem(&virtio_fs_type);
unregister_virtio_driver(&virtio_fs_driver);
+ virtio_fs_sysfs_exit();
}
module_exit(virtio_fs_exit);
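
The new kset gives every device a stable /sys/fs/virtiofs/<virtio-index>/ directory containing a "tag" attribute and a "device" symlink, and the ADD uevent carries TAG=, so management tools can map a mount tag to its device without touching virtio config space. A small userspace sketch that resolves a tag to its sysfs directory, assuming the path layout described above:

/* Resolve a virtiofs mount tag to its /sys/fs/virtiofs/<n> directory. */
#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

static int find_virtiofs_by_tag(const char *tag, char *dir_out, size_t outlen)
{
	char path[PATH_MAX], buf[256];
	struct dirent *de;
	DIR *d = opendir("/sys/fs/virtiofs");
	int ret = -1;

	if (!d)
		return -1;
	while (ret != 0 && (de = readdir(d)) != NULL) {
		FILE *f;

		if (de->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path), "/sys/fs/virtiofs/%s/tag",
			 de->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fgets(buf, sizeof(buf), f)) {
			buf[strcspn(buf, "\n")] = '\0';
			if (strcmp(buf, tag) == 0) {
				snprintf(dir_out, outlen,
					 "/sys/fs/virtiofs/%s", de->d_name);
				ret = 0;
			}
		}
		fclose(f);
	}
	closedir(d);
	return ret;
}
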
diff --git a/fs/inode.c b/fs/inode.c
index d290f007b3d1..3a41f83a4ba5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2033,7 +2033,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
-static int __file_remove_privs(struct file *file, unsigned int flags)
+int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2058,6 +2058,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
+EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2070,7 +2071,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
*/
int file_remove_privs(struct file *file)
{
- return __file_remove_privs(file, 0);
+ return file_remove_privs_flags(file, 0);
}
EXPORT_SYMBOL(file_remove_privs);
@@ -2163,7 +2164,7 @@ static int file_modified_flags(struct file *file, int flags)
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
- ret = __file_remove_privs(file, flags);
+ ret = file_remove_privs_flags(file, flags);
if (ret)
return ret;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 25fca44149dd..2a616a9f289d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -908,8 +908,22 @@ root_found:
* we then decide whether to use the Joliet descriptor.
*/
inode = isofs_iget(s, sbi->s_firstdatazone, 0);
- if (IS_ERR(inode))
- goto out_no_root;
+
+ /*
+ * Fix for broken CDs with a corrupt root inode but a correct Joliet
+ * root directory.
+ */
+ if (IS_ERR(inode)) {
+ if (joliet_level && sbi->s_firstdatazone != first_data_zone) {
+ printk(KERN_NOTICE
+ "ISOFS: root inode is unusable. "
+			       "Disabling Rock Ridge and switching to Joliet.\n");
+ sbi->s_rock = 0;
+ inode = NULL;
+ } else {
+ goto out_no_root;
+ }
+ }
/*
* Fix for broken CDs with Rock Ridge and empty ISO root directory but
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index dd4264aa9bed..10934f9a11be 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -92,7 +92,7 @@ struct jfs_inode_info {
} link;
} u;
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
u32 dev; /* will die when we get wide dev_t */
struct inode vfs_inode;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 73f09a762b79..e1be21ca5d6e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -824,7 +824,7 @@ out:
return len - towrite;
}
-static struct dquot **jfs_get_dquots(struct inode *inode)
+static struct dquot __rcu **jfs_get_dquots(struct inode *inode)
{
return JFS_IP(inode)->i_dquot;
}
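
The __rcu annotation on i_dquot documents that these dquot pointers are now published and read under RCU (the quota core dereferences them inside an SRCU read section), so sparse will flag any plain load or store. A generic kernel-style sketch of the contract the annotation enforces, not the quota code itself:

/* Generic __rcu accessor pattern checked by sparse (illustrative only). */
struct thing { int x; };
struct holder { struct thing __rcu *t; };

static int read_thing(struct holder *h)
{
	struct thing *t;
	int val = 0;

	rcu_read_lock();
	t = rcu_dereference(h->t);	/* marked load */
	if (t)
		val = t->x;
	rcu_read_unlock();
	return val;
}

static void publish_thing(struct holder *h, struct thing *t)
{
	rcu_assign_pointer(h->t, t);	/* marked store with barrier */
}
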
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bce1d7ac95ca..458519e416fe 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
}
EXPORT_SYMBOL_GPL(kernfs_get);
+static void kernfs_free_rcu(struct rcu_head *rcu)
+{
+ struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
+
+ kfree_const(kn->name);
+
+ if (kn->iattr) {
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+ }
+
+ kmem_cache_free(kernfs_node_cache, kn);
+}
+
/**
* kernfs_put - put a reference count on a kernfs_node
* @kn: the target kernfs_node
@@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- kfree_const(kn->name);
-
- if (kn->iattr) {
- simple_xattrs_free(&kn->iattr->xattrs, NULL);
- kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
- }
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
- kmem_cache_free(kernfs_node_cache, kn);
+
+ call_rcu(&kn->rcu, kernfs_free_rcu);
kn = parent;
if (kn) {
@@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
} else {
/* just released the root kn, free @root too */
idr_destroy(&root->ino_idr);
- kfree(root);
+ kfree_rcu(root, rcu);
}
}
EXPORT_SYMBOL_GPL(kernfs_put);
@@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
ino_t ino = kernfs_id_ino(id);
u32 gen = kernfs_id_gen(id);
- spin_lock(&kernfs_idr_lock);
+ rcu_read_lock();
kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn)
@@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;
- spin_unlock(&kernfs_idr_lock);
+ rcu_read_unlock();
return kn;
err_unlock:
- spin_unlock(&kernfs_idr_lock);
+ rcu_read_unlock();
return NULL;
}
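
Switching the lookup from kernfs_idr_lock to rcu_read_lock() is only safe because the free path above now defers the actual kmem_cache_free() through call_rcu(): a reader racing with removal either misses the node in the idr or still sees valid memory and fails the atomic_inc_not_zero() liveness check. The combined pattern, as a generic kernel-style sketch:

/* RCU-safe lookup of refcounted objects in an IDR (illustrative sketch). */
struct obj {
	atomic_t count;
	struct rcu_head rcu;
};

static void obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct obj, rcu));
}

static struct obj *obj_lookup(struct idr *idr, int id)
{
	struct obj *o;

	rcu_read_lock();
	o = idr_find(idr, id);			/* no spinlock needed */
	if (o && !atomic_inc_not_zero(&o->count))
		o = NULL;			/* caught it mid-teardown */
	rcu_read_unlock();
	return o;				/* reference held, or NULL */
}

static void obj_put(struct obj *o, struct idr *idr, spinlock_t *lock, int id)
{
	if (!atomic_dec_and_test(&o->count))
		return;
	spin_lock(lock);
	idr_remove(idr, id);			/* writers still serialize */
	spin_unlock(lock);
	call_rcu(&o->rcu, obj_free_rcu);	/* free after readers drain */
}
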
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ffa4565c275a..e9df2f87072c 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -483,9 +483,11 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
goto out_put;
rc = 0;
- of->mmapped = true;
- of_on(of)->nr_mmapped++;
- of->vm_ops = vma->vm_ops;
+ if (!of->mmapped) {
+ of->mmapped = true;
+ of_on(of)->nr_mmapped++;
+ of->vm_ops = vma->vm_ops;
+ }
vma->vm_ops = &kernfs_vm_ops;
out_put:
kernfs_put_active(of->kn);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 237f2764b941..b42ee6547cdc 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -49,6 +49,8 @@ struct kernfs_root {
struct rw_semaphore kernfs_rwsem;
struct rw_semaphore kernfs_iattr_rwsem;
struct rw_semaphore kernfs_supers_rwsem;
+
+ struct rcu_head rcu;
};
/* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index ad572f7ee897..43a651ed8264 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -83,8 +83,10 @@ static int fscache_begin_operation(struct netfs_cache_resources *cres,
cres->debug_id = cookie->debug_id;
cres->inval_counter = cookie->inval_counter;
- if (!fscache_begin_cookie_access(cookie, why))
+ if (!fscache_begin_cookie_access(cookie, why)) {
+ cres->cache_priv = NULL;
return -ENOBUFS;
+ }
again:
spin_lock(&cookie->lock);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index fbdc9ca80f71..de77848ae654 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -73,14 +73,9 @@ const struct rpc_program nfs_program = {
.number = NFS_PROGRAM,
.nrvers = ARRAY_SIZE(nfs_version),
.version = nfs_version,
- .stats = &nfs_rpcstat,
.pipe_dir_name = NFS_PIPE_DIRNAME,
};
-struct rpc_stat nfs_rpcstat = {
- .program = &nfs_program
-};
-
static struct nfs_subversion *find_nfs_version(unsigned int version)
{
struct nfs_subversion *nfs;
@@ -502,6 +497,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
const struct nfs_client_initdata *cl_init,
rpc_authflavor_t flavor)
{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
struct rpc_clnt *clnt = NULL;
struct rpc_create_args args = {
.net = clp->cl_net,
@@ -513,6 +509,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.servername = clp->cl_hostname,
.nodename = cl_init->nodename,
.program = &nfs_program,
+ .stats = &nn->rpcstats,
.version = clp->rpc_ops->version,
.authflavor = flavor,
.cred = cl_init->cred,
@@ -1182,6 +1179,8 @@ void nfs_clients_init(struct net *net)
#endif
spin_lock_init(&nn->nfs_client_lock);
nn->boot_time = ktime_get_real();
+ memset(&nn->rpcstats, 0, sizeof(nn->rpcstats));
+ nn->rpcstats.program = &nfs_program;
nfs_netns_sysfs_setup(nn, net);
}
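
The global nfs_rpcstat is replaced here by a per-network-namespace counter: each rpc_create() now points .stats at nn->rpcstats, and (in the fs/nfs/inode.c hunk further down) nfs_net_init()/nfs_net_exit() register and unregister the /proc/net/rpc/nfs entry per namespace. The retrieval follows the standard net_generic() pattern, sketched here:

/* Per-netns private data lookup, as used for nn->rpcstats (sketch). */
#include <net/netns/generic.h>

static struct rpc_stat *nfs_rpcstats_for(struct net *net)
{
	struct nfs_net *nn = net_generic(net, nfs_net_id);

	return &nn->rpcstats;
}
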
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d4a42ce0c7e3..6bace5fece04 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -181,7 +181,6 @@ static int nfs_delegation_claim_opens(struct inode *inode,
struct nfs_open_context *ctx;
struct nfs4_state_owner *sp;
struct nfs4_state *state;
- unsigned int seq;
int err;
again:
@@ -202,12 +201,9 @@ again:
sp = state->owner;
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
err = nfs4_open_delegation_recall(ctx, state, stateid);
if (!err)
err = nfs_delegation_claim_locks(state, stateid);
- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
- err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
put_nfs_open_context(ctx);
if (err != 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7af5d270de28..bb2f583eb28b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -606,6 +606,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
trace_nfs_direct_commit_complete(dreq);
+ spin_lock(&dreq->lock);
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
@@ -613,6 +614,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
} else {
status = dreq->error;
}
+ spin_unlock(&dreq->lock);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
@@ -625,7 +627,10 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
spin_unlock(&dreq->lock);
nfs_release_request(req);
} else if (!nfs_write_match_verf(verf, req)) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_lock(&dreq->lock);
+ if (dreq->flags == 0)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
/*
* Despite the reboot, the write was successful,
* so reset wb_nio.
@@ -667,10 +672,17 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
LIST_HEAD(mds_list);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_commit_begin(cinfo.mds);
nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
- if (res < 0) /* res == -ENOMEM */
- nfs_direct_write_reschedule(dreq);
+ if (res < 0) { /* res == -ENOMEM */
+ spin_lock(&dreq->lock);
+ if (dreq->flags == 0)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ }
+ if (nfs_commit_end(cinfo.mds))
+ nfs_direct_write_complete(dreq);
}
static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
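
The nfs_direct_commit_schedule() change above follows a common completion-counting pattern: take one extra reference on cinfo.mds->rpcs_out before scanning, so the counter cannot hit zero while commits are still being queued, then drop it, and let whoever performs the final decrement (here or in a commit callback) run the completion. A minimal kernel-style sketch of that shape; queue_items() and all_done() are hypothetical stand-ins:

/* Hold the completion counter across dispatch (illustrative sketch). */
struct dispatch_ctx {
	atomic_t outstanding;
};

void queue_items(struct dispatch_ctx *c);	/* hypothetical */
void all_done(struct dispatch_ctx *c);		/* hypothetical */

static void dispatch_all(struct dispatch_ctx *c)
{
	atomic_inc(&c->outstanding);		/* cf. nfs_commit_begin() */
	queue_items(c);				/* each item holds its own ref */
	if (atomic_dec_and_test(&c->outstanding))
		all_done(c);			/* cf. nfs_commit_end() path */
}
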
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index acf4b88889dc..4fa304fa5bc4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -35,6 +35,7 @@
#include "../internal.h"
#include "../nfs4session.h"
#include "filelayout.h"
+#include "../nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -172,6 +173,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
if (!dsaddr->ds_list[i])
goto out_err_drain_dsaddrs;
+ trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr);
/* If DS was already in cache, free ds addrs */
while (!list_empty(&dsaddrs)) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ef817a0475ff..3e724cb7ef01 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2016,7 +2016,7 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
mirror = flseg->mirror_array[idx];
mirror_ds = mirror->mirror_ds;
- if (!mirror_ds)
+ if (IS_ERR_OR_NULL(mirror_ds))
continue;
ds = mirror->mirror_ds->ds;
if (!ds)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 853e8d609bb3..d0a0956f8a13 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -652,6 +652,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
ctx->fscache_uniq = NULL;
break;
case Opt_fscache:
+ trace_nfs_mount_assign(param->key, param->string);
ctx->options |= NFS_OPTION_FSCACHE;
kfree(ctx->fscache_uniq);
ctx->fscache_uniq = param->string;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 2d1bfee225c3..ddc1ee031955 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -301,11 +301,11 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
struct inode *inode = sreq->rreq->inode;
struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
struct page *page;
+ unsigned long idx;
int err;
pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
pgoff_t last = ((sreq->start + sreq->len -
sreq->transferred - 1) >> PAGE_SHIFT);
- XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
@@ -316,19 +316,14 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
pgio.pg_netfs = netfs; /* used in completion */
- xas_lock(&xas);
- xas_for_each(&xas, page, last) {
+ xa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) {
/* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */
- xas_pause(&xas);
- xas_unlock(&xas);
err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
if (err < 0) {
netfs->error = err;
goto out;
}
- xas_lock(&xas);
}
- xas_unlock(&xas);
out:
nfs_pageio_complete_read(&pgio);
nfs_netfs_put(netfs);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 93ea49a7eb61..c709c296ea9a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2426,12 +2426,16 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
nfs_clients_init(net);
+ rpc_proc_register(net, &nn->rpcstats);
return nfs_fs_proc_net_init(net);
}
static void nfs_net_exit(struct net *net)
{
+ rpc_proc_unregister(net, "nfs");
nfs_fs_proc_net_exit(net);
nfs_clients_exit(net);
}
@@ -2486,15 +2490,12 @@ static int __init init_nfs_fs(void)
if (err)
goto out1;
- rpc_proc_register(&init_net, &nfs_rpcstat);
-
err = register_nfs_fs();
if (err)
goto out0;
return 0;
out0:
- rpc_proc_unregister(&init_net, "nfs");
nfs_destroy_directcache();
out1:
nfs_destroy_writepagecache();
@@ -2524,7 +2525,6 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
unregister_pernet_subsys(&nfs_net_ops);
- rpc_proc_unregister(&init_net, "nfs");
unregister_nfs_fs();
nfs_fs_proc_exit();
nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e3722ce6722e..06253695fe53 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -449,8 +449,6 @@ int nfs_try_get_tree(struct fs_context *);
int nfs_get_tree_common(struct fs_context *);
void nfs_kill_super(struct super_block *);
-extern struct rpc_stat nfs_rpcstat;
-
extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index c8374f74dce1..a68b21603ea9 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -9,6 +9,7 @@
#include <linux/nfs4.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/sunrpc/stats.h>
struct bl_dev_msg {
int32_t status;
@@ -34,6 +35,7 @@ struct nfs_net {
struct nfs_netns_client *nfs_client;
spinlock_t nfs_client_lock;
ktime_t boot_time;
+ struct rpc_stat rpcstats;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *proc_nfsfs;
#endif
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 674c012868b1..b0c8a39c2bbd 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -111,6 +111,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1)
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b59876b01a1e..0282d93c8bcc 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -55,11 +55,14 @@ int nfs42_proc_removexattr(struct inode *inode, const char *name);
* They would be 7 bytes long in the eventual buffer ("user.x\0"), and
* 8 bytes long XDR-encoded.
*
- * Include the trailing eof word as well.
+ * Include the trailing eof word as well and make the result a multiple
+ * of 4 bytes.
*/
static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
{
- return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4;
+ u32 size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4;
+
+ return (size + 3) & ~3;
}
#endif /* CONFIG_NFS_V4_2 */
#endif /* __LINUX_FS_NFS_NFS4_2_H */
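
Worked numbers for the rounding above: with buflen = 14 (two minimal "user.x\0" names) and XATTR_USER_PREFIX_LEN + 2 = 7, size is 8*14/7 + 4 = 20, already a multiple of 4; with buflen = 10 the truncating division gives 11 + 4 = 15, which the mask rounds up to 16, so the XDR decoder never sees a short, unaligned buffer. A quick userspace check of that arithmetic:

/* Worked check of the xdrsize rounding (userspace, illustrative). */
#include <assert.h>

#define XATTR_USER_PREFIX_LEN 5		/* strlen("user.") */

static unsigned int xdrsize(unsigned int buflen)
{
	unsigned int size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4;

	return (size + 3) & ~3u;
}

int main(void)
{
	assert(xdrsize(14) == 20);	/* already a multiple of 4 */
	assert(xdrsize(10) == 16);	/* 15 rounded up to 16 */
	return 0;
}
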
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6ff41ceb9f1c..7024230f0d1d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -120,7 +120,6 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
- seqcount_spinlock_t so_reclaim_seqcount;
struct mutex so_delegreturn_mutex;
};
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 11e3a285594c..84573df5cf5a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -924,6 +924,7 @@ static int nfs4_set_client(struct nfs_server *server,
else
cl_init.max_connect = max_connect;
switch (proto) {
+ case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
cl_init.nconnect = nconnect;
@@ -1000,6 +1001,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 815996cb27fc..ea390db94b62 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3069,10 +3069,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
struct inode *dir = d_inode(opendata->dir);
unsigned long dir_verifier;
- unsigned int seq;
int ret;
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
dir_verifier = nfs_save_change_attribute(dir);
ret = _nfs4_proc_open(opendata, ctx);
@@ -3125,11 +3123,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (ret != 0)
goto out;
- if (d_inode(dentry) == state->inode) {
+ if (d_inode(dentry) == state->inode)
nfs_inode_attach_open_context(ctx);
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
- nfs4_schedule_stateid_recovery(server, state);
- }
out:
if (!opendata->cancelled) {
@@ -8973,10 +8968,12 @@ try_again:
return;
status = task->tk_status;
- if (status == 0)
+ if (status == 0) {
status = nfs4_detect_session_trunking(adata->clp,
task->tk_msg.rpc_resp, xprt);
-
+ trace_nfs4_trunked_exchange_id(adata->clp,
+ xprt->address_strings[RPC_DISPLAY_ADDR], status);
+ }
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
@@ -10618,29 +10615,33 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
ssize_t error, error2, error3;
+ size_t left = size;
- error = generic_listxattr(dentry, list, size);
+ error = generic_listxattr(dentry, list, left);
if (error < 0)
return error;
if (list) {
list += error;
- size -= error;
+ left -= error;
}
- error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, left);
if (error2 < 0)
return error2;
if (list) {
list += error2;
- size -= error2;
+ left -= error2;
}
- error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+ error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left);
if (error3 < 0)
return error3;
- return error + error2 + error3;
+ error += error2 + error3;
+ if (size && error > size)
+ return -ERANGE;
+ return error;
}
static void nfs4_enable_swap(struct inode *inode)
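
The accumulation rewrite makes nfs4_listxattr() behave the way listxattr(2) callers expect: a zero-size probe returns the full length across all three providers, and a buffer that turns out too small fails with -ERANGE rather than returning a silently truncated list. The conventional two-call pattern this preserves, as a userspace sketch:

/* Conventional listxattr(2) two-call pattern (userspace sketch). */
#include <stdlib.h>
#include <sys/xattr.h>

static char *list_xattrs(const char *path, ssize_t *len_out)
{
	ssize_t len = listxattr(path, NULL, 0);	/* size-only probe */
	char *buf;

	if (len <= 0)
		return NULL;
	buf = malloc(len);
	if (!buf)
		return NULL;
	len = listxattr(path, buf, len);	/* errno ERANGE if it grew */
	if (len < 0) {
		free(buf);
		return NULL;
	}
	*len_out = len;
	return buf;				/* NUL-separated names */
}
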
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 8cfabdbda336..662e86ea3a2d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -513,7 +513,6 @@ nfs4_alloc_state_owner(struct nfs_server *server,
nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
- seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock);
mutex_init(&sp->so_delegreturn_mutex);
return sp;
}
@@ -1667,7 +1666,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp,
* server that doesn't support a grace period.
*/
spin_lock(&sp->so_lock);
- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1735,7 +1733,6 @@ restart:
spin_lock(&sp->so_lock);
goto restart;
}
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
#ifdef CONFIG_NFS_V4_2
if (found_ssc_copy_state)
@@ -1745,7 +1742,6 @@ restart:
out_err:
nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return status;
}
@@ -1928,9 +1924,12 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct nfs_server *server;
struct rb_node *pos;
LIST_HEAD(freeme);
- int status = 0;
int lost_locks = 0;
+ int status;
+ status = nfs4_begin_drain_session(clp);
+ if (status < 0)
+ return status;
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
@@ -2694,6 +2693,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
/* Detect expired delegations... */
if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
section = "detect expired delegations";
+ status = nfs4_begin_drain_session(clp);
+ if (status < 0)
+ goto out_error;
nfs_reap_expired_delegations(clp);
continue;
}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index d09bcfd7db89..8da5a9c000f4 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -145,6 +145,7 @@ static int do_nfs4_mount(struct nfs_server *server,
const char *export_path)
{
struct nfs_fs_context *root_ctx;
+ struct nfs_fs_context *ctx;
struct fs_context *root_fc;
struct vfsmount *root_mnt;
struct dentry *dentry;
@@ -157,6 +158,12 @@ static int do_nfs4_mount(struct nfs_server *server,
.dirfd = -1,
};
+ struct fs_parameter param_fsc = {
+ .key = "fsc",
+ .type = fs_value_is_string,
+ .dirfd = -1,
+ };
+
if (IS_ERR(server))
return PTR_ERR(server);
@@ -168,9 +175,26 @@ static int do_nfs4_mount(struct nfs_server *server,
kfree(root_fc->source);
root_fc->source = NULL;
+ ctx = nfs_fc2context(fc);
root_ctx = nfs_fc2context(root_fc);
root_ctx->internal = true;
root_ctx->server = server;
+
+ if (ctx->fscache_uniq) {
+ len = strlen(ctx->fscache_uniq);
+ param_fsc.size = len;
+ param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL);
+ if (param_fsc.string == NULL) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+ ret = vfs_parse_fs_param(root_fc, &param_fsc);
+ kfree(param_fsc.string);
+ if (ret < 0) {
+ put_fs_context(root_fc);
+ return ret;
+ }
+ }
/* We leave export_path unset as it's not used to find the root. */
len = strlen(hostname) + 5;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d9ac556bebcf..d22c6670f770 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -28,4 +28,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo);
#endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index fd7cb15b08b2..10985a4b8259 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -77,6 +77,36 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+TRACE_EVENT(nfs4_trunked_exchange_id,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const char *addr,
+ int error
+ ),
+
+ TP_ARGS(clp, addr, error),
+
+ TP_STRUCT__entry(
+ __string(main_addr, clp->cl_hostname)
+ __string(trunk_addr, addr)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(main_addr, clp->cl_hostname);
+ __assign_str(trunk_addr, addr);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) main_addr=%s trunk_addr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __get_str(main_addr),
+ __get_str(trunk_addr)
+ )
+);
+
TRACE_EVENT(nfs4_sequence_done,
TP_PROTO(
const struct nfs4_session *session,
@@ -1991,6 +2021,34 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status,
DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo);
DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid);
+TRACE_EVENT(fl_getdevinfo,
+ TP_PROTO(
+ const struct nfs_server *server,
+ const struct nfs4_deviceid *deviceid,
+ char *ds_remotestr
+ ),
+ TP_ARGS(server, deviceid, ds_remotestr),
+
+ TP_STRUCT__entry(
+ __string(mds_addr, server->nfs_client->cl_hostname)
+ __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+ __string(ds_ips, ds_remotestr)
+ ),
+
+ TP_fast_assign(
+ __assign_str(mds_addr, server->nfs_client->cl_hostname);
+ __assign_str(ds_ips, ds_remotestr);
+ memcpy(__entry->deviceid, deviceid->data,
+ NFS4_DEVICEID4_SIZE);
+ ),
+ TP_printk(
+ "deviceid=%s, mds_addr=%s, ds_ips=%s",
+ __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+ __get_str(mds_addr),
+ __get_str(ds_ips)
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_PROTO(
const struct nfs_pgio_header *hdr
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 7600100ba26f..432612d22437 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -175,10 +175,10 @@ static int __init root_nfs_cat(char *dest, const char *src,
size_t len = strlen(dest);
if (len && dest[len - 1] != ',')
- if (strlcat(dest, ",", destlen) > destlen)
+ if (strlcat(dest, ",", destlen) >= destlen)
return -1;
- if (strlcat(dest, src, destlen) > destlen)
+ if (strlcat(dest, src, destlen) >= destlen)
return -1;
return 0;
}
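
strlcat() returns the length of the string it tried to create; since one byte of destlen is reserved for the terminating NUL, truncation has occurred whenever that return value is greater than or equal to destlen, which is exactly the boundary case the old '>' comparison missed. A small userspace demonstration (strlcat() via libbsd on Linux, link with -lbsd):

/* strlcat truncation boundary (userspace demo). */
#include <assert.h>
#include <bsd/string.h>

int main(void)
{
	char dest[8] = "abcdef";	/* 6 chars + NUL, destlen = 8 */
	size_t r = strlcat(dest, "gh", sizeof(dest));

	/* Tried to build "abcdefgh" (8 chars): r == 8 == destlen, yet only
	 * "abcdefg" fit.  The old '>' check would have missed this. */
	assert(r == sizeof(dest));
	assert(dest[6] == 'g' && dest[7] == '\0');
	return 0;
}
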
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0c0fed1ecd0b..a5cc6199127f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1999,6 +1999,14 @@ pnfs_update_layout(struct inode *ino,
}
lookup_again:
+ if (!nfs4_valid_open_stateid(ctx->state)) {
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+ lseg = ERR_PTR(-EIO);
+ goto out;
+ }
+
lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
if (IS_ERR(lseg))
goto out;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index afd23910f3bf..88e061bd711b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -919,6 +919,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ char servername[48];
+
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
@@ -929,6 +931,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
.dstaddr = (struct sockaddr *)&da->da_addr,
.addrlen = da->da_addrlen,
.servername = clp->cl_hostname,
+ .xprtsec = clp->cl_xprtsec,
};
struct nfs4_add_xprt_data xprtdata = {
.clp = clp,
@@ -938,10 +941,45 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
.data = &xprtdata,
};
- if (da->da_transport != clp->cl_proto)
+ if (da->da_transport != clp->cl_proto &&
+ clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
continue;
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto ==
+ XPRT_TRANSPORT_TCP_TLS) {
+ struct sockaddr *addr =
+ (struct sockaddr *)&da->da_addr;
+ struct sockaddr_in *sin =
+ (struct sockaddr_in *)&da->da_addr;
+ struct sockaddr_in6 *sin6 =
+ (struct sockaddr_in6 *)&da->da_addr;
+
+ /* For NFS over TLS we need to supply the correct
+ * servername of the trunked transport, not the
+ * servername of the main transport stored in
+ * clp->cl_hostname, and set the protocol to
+ * indicate the use of TLS.
+ */
+ servername[0] = '\0';
+ switch(addr->sa_family) {
+ case AF_INET:
+ snprintf(servername, sizeof(servername),
+ "%pI4", &sin->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ snprintf(servername, sizeof(servername),
+ "%pI6", &sin6->sin6_addr);
+ break;
+ default:
+ /* do not consider this address */
+ continue;
+ }
+ xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
+ xprt_args.servername = servername;
+ }
if (da->da_addr.ss_family != clp->cl_addr.ss_family)
continue;
+
/**
* Test this address for session trunking and
* add as an alias
@@ -953,6 +991,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
if (xprtdata.cred)
put_cred(xprtdata.cred);
} else {
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto ==
+ XPRT_TRANSPORT_TCP_TLS)
+ da->da_transport = XPRT_TRANSPORT_TCP_TLS;
clp = nfs4_set_ds_client(mds_srv,
&da->da_addr,
da->da_addrlen,
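
Since %pI4 and %pI6 are printk format extensions available only inside the kernel, here is a hedged user-space sketch of the same servername derivation using inet_ntop(); the 48-byte buffer mirrors the kernel code above and is large enough for any textual IPv6 address:

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>

/* Derive a per-address servername for a trunked TLS transport, mirroring
 * the switch on sa_family above; returns 0 on success, -1 for families
 * the caller should skip. */
static int ds_servername(const struct sockaddr_storage *ss,
			 char *name, size_t len)
{
	const struct sockaddr *sa = (const struct sockaddr *)ss;

	switch (sa->sa_family) {
	case AF_INET:
		inet_ntop(AF_INET,
			  &((const struct sockaddr_in *)sa)->sin_addr,
			  name, len);
		return 0;
	case AF_INET6:
		inet_ntop(AF_INET6,
			  &((const struct sockaddr_in6 *)sa)->sin6_addr,
			  name, len);
		return 0;
	default:
		return -1;	/* do not consider this address */
	}
}

int main(void)
{
	struct sockaddr_storage ss = { 0 };
	struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
	char servername[48];

	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.7", &sin->sin_addr);

	if (!ds_servername(&ss, servername, sizeof(servername)))
		printf("servername=%s\n", servername);
	return 0;
}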
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 7dc21a48e3e7..a142287d86f6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -305,6 +305,8 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
if (IS_ERR(new)) {
error = PTR_ERR(new);
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
goto out;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 075b31c93f87..dc03f98f7616 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -516,8 +516,16 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
else
nfs_show_nfsv4_options(m, nfss, showdefaults);
- if (nfss->options & NFS_OPTION_FSCACHE)
+ if (nfss->options & NFS_OPTION_FSCACHE) {
+#ifdef CONFIG_NFS_FSCACHE
+ if (nfss->fscache_uniq)
+ seq_printf(m, ",fsc=%s", nfss->fscache_uniq);
+ else
+ seq_puts(m, ",fsc");
+#else
seq_puts(m, ",fsc");
+#endif
+ }
if (nfss->options & NFS_OPTION_MIGRATION)
seq_puts(m, ",migration");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 84bb85264572..5de85d725fb9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -667,10 +667,6 @@ static int nfs_writepage_locked(struct folio *folio,
struct inode *inode = folio_file_mapping(folio)->host;
int err;
- if (wbc->sync_mode == WB_SYNC_NONE &&
- NFS_SERVER(inode)->write_congested)
- return AOP_WRITEPAGE_ACTIVATE;
-
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0, false,
&nfs_async_write_completion_ops);
@@ -1650,7 +1646,7 @@ static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
!atomic_read(&cinfo->rpcs_out));
}
-static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
{
atomic_inc(&cinfo->rpcs_out);
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index e545e92c4408..1cd2076210b1 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound,
TP_fast_assign(
__entry->xid = be32_to_cpu(rqst->rq_xid);
__entry->opcnt = opcnt;
- __assign_str_len(tag, tag, taglen);
+ __assign_str(tag, tag);
),
TP_printk("xid=0x%08x opcnt=%u tag=%s",
__entry->xid, __entry->opcnt, __get_str(tag)
@@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent,
TP_fast_assign(
__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
__entry->ino = ino;
- __assign_str_len(name, name, namlen)
+ __assign_str(name, name);
),
TP_printk("fh_hash=0x%08x ino=%llu name=%s",
__entry->fh_hash, __entry->ino, __get_str(name)
@@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__array(unsigned char, addr, sizeof(struct sockaddr_in6))
__field(unsigned long, flavor)
__array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
- __string_len(name, name, clp->cl_name.len)
+ __string_len(name, clp->cl_name.data, clp->cl_name.len)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -906,7 +906,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__entry->flavor = clp->cl_cred.cr_flavor;
memcpy(__entry->verifier, (void *)&clp->cl_verifier,
NFS4_VERIFIER_SIZE);
- __assign_str_len(name, clp->cl_name.data, clp->cl_name.len);
+ __assign_str(name, clp->cl_name.data);
),
TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
__entry->addr, __get_str(name),
@@ -1976,7 +1976,7 @@ TRACE_EVENT(nfsd_ctl_time,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->time = time;
- __assign_str_len(name, name, namelen);
+ __assign_str(name, name);
),
TP_printk("file=%s time=%d\n",
__get_str(name), __entry->time
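
The nfsd tracing changes above all follow one pattern: the length-bounded copy moves from __assign_str_len() into the __string_len() declaration, after which a plain two-argument __assign_str() suffices because the tracing core already knows the reserved length. A minimal kernel-style sketch of the resulting pairing (event and field names are illustrative, not from this file):

TRACE_EVENT(demo_name,
	TP_PROTO(const char *buf, size_t len),
	TP_ARGS(buf, len),

	TP_STRUCT__entry(
		/* reserve exactly len bytes (plus NUL) in the ring buffer */
		__string_len(name, buf, len)
	),

	TP_fast_assign(
		/* length comes from the __string_len() declaration above */
		__assign_str(name, buf);
	),

	TP_printk("name=%s", __get_str(name))
);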
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7342de296ec3..89caef7513db 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
if (ret < 0)
return ret;
- desc_kaddr = kmap(desc_bh->b_page);
+ desc_kaddr = kmap_local_page(desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
- for (j = 0; j < n; j++, desc++, group++) {
+ for (j = 0; j < n; j++, desc++, group++, group_offset = 0) {
lock = nilfs_mdt_bgl_lock(inode, group);
- if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
- ret = nilfs_palloc_get_bitmap_block(
- inode, group, 1, &bitmap_bh);
- if (ret < 0)
- goto out_desc;
- bitmap_kaddr = kmap(bitmap_bh->b_page);
- bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset,
- entries_per_group, lock);
- if (pos >= 0) {
- /* found a free entry */
- nilfs_palloc_group_desc_add_entries(
- desc, lock, -1);
- req->pr_entry_nr =
- entries_per_group * group + pos;
- kunmap(desc_bh->b_page);
- kunmap(bitmap_bh->b_page);
-
- req->pr_desc_bh = desc_bh;
- req->pr_bitmap_bh = bitmap_bh;
- return 0;
- }
- kunmap(bitmap_bh->b_page);
- brelse(bitmap_bh);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0)
+ continue;
+
+ kunmap_local(desc_kaddr);
+ ret = nilfs_palloc_get_bitmap_block(inode, group, 1,
+ &bitmap_bh);
+ if (unlikely(ret < 0)) {
+ brelse(desc_bh);
+ return ret;
}
- group_offset = 0;
+ desc_kaddr = kmap_local_page(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+
+ bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
+ bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+ pos = nilfs_palloc_find_available_slot(
+ bitmap, group_offset, entries_per_group, lock);
+ kunmap_local(bitmap_kaddr);
+ if (pos >= 0)
+ goto found;
+
+ brelse(bitmap_bh);
}
- kunmap(desc_bh->b_page);
+ kunmap_local(desc_kaddr);
brelse(desc_bh);
}
/* no entries left */
return -ENOSPC;
- out_desc:
- kunmap(desc_bh->b_page);
- brelse(desc_bh);
- return ret;
+found:
+ /* found a free entry */
+ nilfs_palloc_group_desc_add_entries(desc, lock, -1);
+ req->pr_entry_nr = entries_per_group * group + pos;
+ kunmap_local(desc_kaddr);
+
+ req->pr_desc_bh = desc_bh;
+ req->pr_bitmap_bh = bitmap_bh;
+ return 0;
}
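
The loop restructuring above follows from a property of the local-mapping API: kmap_local_page() mappings are stack-like, so kunmap_local() must release them in reverse order of acquisition. That is why the descriptor mapping is now dropped before fetching the bitmap block and re-taken afterwards, rather than being held across the whole inner sequence as the old kmap() allowed. A kernel-style sketch of the rule (illustrative, not from this file):

#include <linux/highmem.h>

/* Local mappings nest like a stack: release them in reverse order. */
static void demo_lifo(struct page *outer, struct page *inner)
{
	void *ko = kmap_local_page(outer);
	void *ki = kmap_local_page(inner);	/* nested inside ko */

	/* ... use ko and ki ... */

	kunmap_local(ki);	/* most recent mapping goes first */
	kunmap_local(ko);
}

/* If the outer mapping would otherwise have to be released out of
 * order, drop it and re-take it around the inner work instead; note
 * that the returned address may differ after the re-map. */
static void *demo_retake(struct page *outer)
{
	void *ko = kmap_local_page(outer);

	kunmap_local(ko);
	/* ... inner mappings created and released here ... */
	ko = kmap_local_page(outer);
	return ko;		/* caller still owns this mapping */
}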
/**
@@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
- desc_kaddr = kmap(req->pr_desc_bh->b_page);
+ desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(inode, group,
req->pr_desc_bh, desc_kaddr);
- bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+ bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
@@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
- kunmap(req->pr_bitmap_bh->b_page);
- kunmap(req->pr_desc_bh->b_page);
+ kunmap_local(bitmap_kaddr);
+ kunmap_local(desc_kaddr);
mark_buffer_dirty(req->pr_desc_bh);
mark_buffer_dirty(req->pr_bitmap_bh);
@@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
- desc_kaddr = kmap(req->pr_desc_bh->b_page);
+ desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(inode, group,
req->pr_desc_bh, desc_kaddr);
- bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+ bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
@@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
- kunmap(req->pr_bitmap_bh->b_page);
- kunmap(req->pr_desc_bh->b_page);
+ kunmap_local(bitmap_kaddr);
+ kunmap_local(desc_kaddr);
brelse(req->pr_bitmap_bh);
brelse(req->pr_desc_bh);
@@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
/* Get the first entry number of the group */
group_min_nr = (__u64)group * epg;
- bitmap_kaddr = kmap(bitmap_bh->b_page);
+ bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
@@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
entry_start = rounddown(group_offset, epb);
} while (true);
- kunmap(bitmap_bh->b_page);
+ kunmap_local(bitmap_kaddr);
mark_buffer_dirty(bitmap_bh);
brelse(bitmap_bh);
@@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
inode->i_ino);
}
- desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc_kaddr = kmap_local_page(desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
- kunmap_atomic(desc_kaddr);
+ kunmap_local(desc_kaddr);
mark_buffer_dirty(desc_bh);
nilfs_mdt_mark_dirty(inode);
brelse(desc_bh);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 7a8f166f2c8d..383f0afa2cea 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
*/
void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
{
- down_write(&bmap->b_sem);
memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
NILFS_INODE_BMAP_SIZE * sizeof(__le64));
if (bmap->b_inode->i_ino == NILFS_DAT_INO)
bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
-
- up_write(&bmap->b_sem);
}
void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 13592e82eaf6..65659fa0372e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -724,7 +724,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
dat = nilfs_bmap_get_dat(btree);
ret = nilfs_dat_translate(dat, ptr, &blocknr);
if (ret < 0)
- goto out;
+ goto dat_error;
ptr = blocknr;
}
cnt = 1;
@@ -743,7 +743,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
if (dat) {
ret = nilfs_dat_translate(dat, ptr2, &blocknr);
if (ret < 0)
- goto out;
+ goto dat_error;
ptr2 = blocknr;
}
if (ptr2 != ptr + cnt || ++cnt == maxblocks)
@@ -781,6 +781,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
out:
nilfs_btree_free_path(path);
return ret;
+
+ dat_error:
+ if (ret == -ENOENT)
+ ret = -EINVAL; /* Notify bmap layer of metadata corruption */
+ goto out;
}
static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
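
The same dat_error exit is added to the direct-mapping lookup later in this diff: a DAT entry that cannot be found at this point implies corrupted metadata, so -ENOENT is remapped to -EINVAL, which is how the bmap layer is told to treat the failure as corruption. Condensed into a self-contained helper for illustration (the function name is made up):

#include <errno.h>

/* Hypothetical condensation of the dat_error label above: a missing
 * DAT entry during lookup can only mean corrupt metadata, so report
 * it as -EINVAL to the bmap layer; other errors pass through. */
static int dat_lookup_error_to_bmap(int ret)
{
	return (ret == -ENOENT) ? -EINVAL : ret;
}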
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 39136637f715..69a5cced1e84 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -28,7 +28,7 @@ nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
- do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+ tcno = div64_ul(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
return (unsigned long)tcno;
}
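
The do_div() to div64_ul() conversions in this file are mechanical but worth spelling out, since the two helpers have different shapes: do_div() is a macro that overwrites its first argument with the quotient and evaluates to the remainder, while div64_ul() is an ordinary function returning the quotient. A kernel-style sketch of the equivalent forms:

#include <linux/math64.h>

/* Old style: do_div() divides its first argument in place (it becomes
 * the quotient) and evaluates to the remainder, which is ignored here. */
static unsigned long blkoff_do_div(u64 tcno, unsigned long per_block)
{
	do_div(tcno, per_block);
	return (unsigned long)tcno;
}

/* New style: div64_ul() returns the quotient, which reads as an
 * ordinary assignment at the call site. */
static unsigned long blkoff_div64_ul(u64 tcno, unsigned long per_block)
{
	tcno = div64_ul(tcno, per_block);
	return (unsigned long)tcno;
}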
@@ -187,35 +187,90 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
}
/**
- * nilfs_cpfile_get_checkpoint - get a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @create: create flag
- * @cpp: pointer to a checkpoint
- * @bhp: pointer to a buffer head
- *
- * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
- * specified by @cno. A new checkpoint will be created if @cno is the current
- * checkpoint number and @create is nonzero.
- *
- * Return Value: On success, 0 is returned, and the checkpoint and the
- * buffer head of the buffer on which the checkpoint is located are stored in
- * the place pointed by @cpp and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
+ * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno: number of checkpoint entry to read
+ * @root: nilfs root object
+ * @ifile: ifile's inode to read and attach to @root
*
- * %-EIO - I/O error.
+ * This function imports checkpoint information from the checkpoint file and
+ * stores it to the inode file given by @ifile and the nilfs root object
+ * given by @root.
*
- * %-ENOMEM - Insufficient amount of memory available.
+ * Return: 0 on success, or one of the following negative error codes on failure.
+ * * %-EINVAL - Invalid checkpoint.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
+ */
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, struct inode *ifile)
+{
+ struct buffer_head *cp_bh;
+ struct nilfs_checkpoint *cp;
+ void *kaddr;
+ int ret;
+
+ if (cno < 1 || cno > nilfs_mdt_cno(cpfile))
+ return -EINVAL;
+
+ down_read(&NILFS_MDT(cpfile)->mi_sem);
+ ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ goto out_sem;
+ }
+
+ kaddr = kmap_local_page(cp_bh->b_page);
+ cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+ if (nilfs_checkpoint_invalid(cp)) {
+ ret = -EINVAL;
+ goto put_cp;
+ }
+
+ ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode);
+ if (unlikely(ret)) {
+ /*
+ * Since this inode is on a checkpoint entry, treat errors
+ * as metadata corruption.
+ */
+ nilfs_err(cpfile->i_sb,
+ "ifile inode (checkpoint number=%llu) corrupted",
+ (unsigned long long)cno);
+ ret = -EIO;
+ goto put_cp;
+ }
+
+ /* Configure the nilfs root object */
+ atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count));
+ atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count));
+ root->ifile = ifile;
+
+put_cp:
+ kunmap_local(kaddr);
+ brelse(cp_bh);
+out_sem:
+ up_read(&NILFS_MDT(cpfile)->mi_sem);
+ return ret;
+}
+
+/**
+ * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile
+ * @cpfile: checkpoint file inode
+ * @cno: number of checkpoint to set up
*
- * %-ENOENT - No such checkpoint.
+ * This function creates a checkpoint with the number specified by @cno on
+ * cpfile. If the specified checkpoint entry already exists due to a past
+ * failure, it will be reused without returning an error.
+ * In either case, the buffer of the block containing the checkpoint entry
+ * and the cpfile inode are made dirty for inclusion in the write log.
*
- * %-EINVAL - invalid checkpoint.
+ * Return: 0 on success, or one of the following negative error codes on failure.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-EROFS - Read-only filesystem.
*/
-int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
- __u64 cno,
- int create,
- struct nilfs_checkpoint **cpp,
- struct buffer_head **bhp)
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno)
{
struct buffer_head *header_bh, *cp_bh;
struct nilfs_cpfile_header *header;
@@ -223,70 +278,128 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
void *kaddr;
int ret;
- if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
- (cno < nilfs_mdt_cno(cpfile) && create)))
- return -EINVAL;
+ if (WARN_ON_ONCE(cno < 1))
+ return -EIO;
down_write(&NILFS_MDT(cpfile)->mi_sem);
-
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
- if (ret < 0)
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT) {
+ nilfs_error(cpfile->i_sb,
+ "checkpoint creation failed due to metadata corruption.");
+ ret = -EIO;
+ }
goto out_sem;
- ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
- if (ret < 0)
+ }
+ ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh);
+ if (unlikely(ret < 0))
goto out_header;
- kaddr = kmap(cp_bh->b_page);
+
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
- if (!create) {
- kunmap(cp_bh->b_page);
- brelse(cp_bh);
- ret = -ENOENT;
- goto out_header;
- }
/* a newly-created checkpoint */
nilfs_checkpoint_clear_invalid(cp);
if (!nilfs_cpfile_is_in_first(cpfile, cno))
nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
kaddr, 1);
- mark_buffer_dirty(cp_bh);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, 1);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(header_bh);
- nilfs_mdt_mark_dirty(cpfile);
+ } else {
+ kunmap_local(kaddr);
}
- if (cpp != NULL)
- *cpp = cp;
- *bhp = cp_bh;
+ /* Force the buffer and the inode to become dirty */
+ mark_buffer_dirty(cp_bh);
+ brelse(cp_bh);
+ nilfs_mdt_mark_dirty(cpfile);
- out_header:
+out_header:
brelse(header_bh);
- out_sem:
+out_sem:
up_write(&NILFS_MDT(cpfile)->mi_sem);
return ret;
}
/**
- * nilfs_cpfile_put_checkpoint - put a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @bh: buffer head
+ * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno: checkpoint number
+ * @root: nilfs root object
+ * @blkinc: number of blocks added by this checkpoint
+ * @ctime: checkpoint creation time
+ * @minor: minor checkpoint flag
+ *
+ * This function completes the checkpoint entry numbered by @cno in the
+ * cpfile with the data given by the arguments @root, @blkinc, @ctime, and
+ * @minor.
*
- * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
- * specified by @cno. @bh must be the buffer head which has been returned by
- * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
+ * Return: 0 on success, or one of the following negative error codes on failure.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
*/
-void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
- struct buffer_head *bh)
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, __u64 blkinc,
+ time64_t ctime, bool minor)
{
- kunmap(bh->b_page);
- brelse(bh);
+ struct buffer_head *cp_bh;
+ struct nilfs_checkpoint *cp;
+ void *kaddr;
+ int ret;
+
+ if (WARN_ON_ONCE(cno < 1))
+ return -EIO;
+
+ down_write(&NILFS_MDT(cpfile)->mi_sem);
+ ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT)
+ goto error;
+ goto out_sem;
+ }
+
+ kaddr = kmap_local_page(cp_bh->b_page);
+ cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+ if (unlikely(nilfs_checkpoint_invalid(cp))) {
+ kunmap_local(kaddr);
+ brelse(cp_bh);
+ goto error;
+ }
+
+ cp->cp_snapshot_list.ssl_next = 0;
+ cp->cp_snapshot_list.ssl_prev = 0;
+ cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count));
+ cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count));
+ cp->cp_nblk_inc = cpu_to_le64(blkinc);
+ cp->cp_create = cpu_to_le64(ctime);
+ cp->cp_cno = cpu_to_le64(cno);
+
+ if (minor)
+ nilfs_checkpoint_set_minor(cp);
+ else
+ nilfs_checkpoint_clear_minor(cp);
+
+ nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode);
+ nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode);
+
+ kunmap_local(kaddr);
+ brelse(cp_bh);
+out_sem:
+ up_write(&NILFS_MDT(cpfile)->mi_sem);
+ return ret;
+
+error:
+ nilfs_error(cpfile->i_sb,
+ "checkpoint finalization failed due to metadata corruption.");
+ ret = -EIO;
+ goto out_sem;
}
/**
@@ -347,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
continue;
}
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(
cpfile, cno, cp_bh, kaddr);
nicps = 0;
@@ -369,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cp_bh, kaddr, nicps);
if (count == 0) {
/* make hole */
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(cp_bh);
ret =
nilfs_cpfile_delete_checkpoint_block(
@@ -384,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
}
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(cp_bh);
}
if (tnicps > 0) {
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
brelse(header_bh);
@@ -447,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
}
ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
if (!nilfs_checkpoint_invalid(cp)) {
@@ -457,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
n++;
}
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
}
@@ -491,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
ret = nilfs_cpfile_get_header_block(cpfile, &bh);
if (ret < 0)
goto out;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
if (curr == 0) {
ret = 0;
@@ -512,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
ret = 0; /* No snapshots (started from a hole block) */
goto out;
}
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
while (n < nci) {
cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
curr = ~(__u64)0; /* Terminator */
@@ -528,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
if (curr_blkoff != next_blkoff) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
0, &bh);
@@ -536,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
WARN_ON(ret == -ENOENT);
goto out;
}
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
}
curr = next;
curr_blkoff = next_blkoff;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
*cnop = curr;
ret = n;
@@ -650,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
ret = -ENOENT;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
if (nilfs_checkpoint_snapshot(cp)) {
ret = 0;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
if (ret < 0)
goto out_cp;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
list = &header->ch_snapshot_list;
curr_bh = header_bh;
@@ -679,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
curr = prev;
if (curr_blkoff != prev_blkoff) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(curr_bh);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
0, &curr_bh);
if (ret < 0)
goto out_header;
- kaddr = kmap_atomic(curr_bh->b_page);
+ kaddr = kmap_local_page(curr_bh->b_page);
}
curr_blkoff = prev_blkoff;
cp = nilfs_cpfile_block_get_checkpoint(
@@ -693,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
list = &cp->cp_snapshot_list;
prev = le64_to_cpu(list->ssl_prev);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (prev != 0) {
ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
@@ -705,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
get_bh(prev_bh);
}
- kaddr = kmap_atomic(curr_bh->b_page);
+ kaddr = kmap_local_page(curr_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, curr, curr_bh, kaddr);
list->ssl_prev = cpu_to_le64(cno);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
nilfs_checkpoint_set_snapshot(cp);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(prev_bh->b_page);
+ kaddr = kmap_local_page(prev_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, prev, prev_bh, kaddr);
list->ssl_next = cpu_to_le64(cno);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
le64_add_cpu(&header->ch_nsnapshots, 1);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(prev_bh);
mark_buffer_dirty(curr_bh);
@@ -768,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
ret = -ENOENT;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
if (!nilfs_checkpoint_snapshot(cp)) {
ret = 0;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
list = &cp->cp_snapshot_list;
next = le64_to_cpu(list->ssl_next);
prev = le64_to_cpu(list->ssl_prev);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
if (ret < 0)
@@ -808,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
get_bh(prev_bh);
}
- kaddr = kmap_atomic(next_bh->b_page);
+ kaddr = kmap_local_page(next_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, next, next_bh, kaddr);
list->ssl_prev = cpu_to_le64(prev);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(prev_bh->b_page);
+ kaddr = kmap_local_page(prev_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, prev, prev_bh, kaddr);
list->ssl_next = cpu_to_le64(next);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
nilfs_checkpoint_clear_snapshot(cp);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
le64_add_cpu(&header->ch_nsnapshots, -1);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(next_bh);
mark_buffer_dirty(prev_bh);
@@ -889,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
if (ret < 0)
goto out;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
if (nilfs_checkpoint_invalid(cp))
ret = -ENOENT;
else
ret = nilfs_checkpoint_snapshot(cp);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
out:
@@ -972,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
ret = nilfs_cpfile_get_header_block(cpfile, &bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
cpstat->cs_cno = nilfs_mdt_cno(cpfile);
cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
out_sem:
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index edabb2dc5756..f5b1d59289eb 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -16,10 +16,12 @@
#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
-int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
- struct nilfs_checkpoint **,
- struct buffer_head **);
-void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, struct inode *ifile);
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, __u64 blkinc,
+ time64_t ctime, bool minor);
int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
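
Taken together, these declarations replace the old get/put checkpoint pairing with three self-contained operations. Based on the callers changed elsewhere in this diff, the intended flow is roughly the following (a sketch with error handling trimmed):

	/* While collecting blocks: make sure the checkpoint entry exists. */
	err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno);

	/* When finalizing the log: fill the entry in from the root object. */
	err = nilfs_cpfile_finalize_checkpoint(
		nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root,
		sci->sc_nblk_inc + sci->sc_nblk_this_inc, sci->sc_seg_ctime,
		!test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags));

	/* At mount / checkpoint attach time: import the entry into @root. */
	err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile);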
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9cf6ba58f585..180fc8d36213 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
entry->de_blocknr = cpu_to_le64(0);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_palloc_commit_alloc_entry(dat, req);
nilfs_dat_commit_entry(dat, req);
@@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
entry->de_blocknr = cpu_to_le64(0);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_dat_commit_entry(dat, req);
@@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
entry->de_blocknr = cpu_to_le64(blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_dat_commit_entry(dat, req);
}
@@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
if (ret < 0)
return ret;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (blocknr == 0) {
ret = nilfs_palloc_prepare_free_entry(dat, req);
@@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
sector_t blocknr;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
end = start = le64_to_cpu(entry->de_start);
@@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
}
entry->de_end = cpu_to_le64(end);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (blocknr == 0)
nilfs_dat_commit_free(dat, req);
@@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
sector_t blocknr;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (start == nilfs_mdt_cno(dat) && blocknr == 0)
nilfs_palloc_abort_free_entry(dat, req);
@@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
}
}
- kaddr = kmap_atomic(entry_bh->b_page);
+ kaddr = kmap_local_page(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
nilfs_crit(dat->i_sb,
@@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
__func__, (unsigned long long)vblocknr,
(unsigned long long)le64_to_cpu(entry->de_start),
(unsigned long long)le64_to_cpu(entry->de_end));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(entry_bh);
return -EINVAL;
}
WARN_ON(blocknr == 0);
entry->de_blocknr = cpu_to_le64(blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(entry_bh);
nilfs_mdt_mark_dirty(dat);
@@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
}
}
- kaddr = kmap_atomic(entry_bh->b_page);
+ kaddr = kmap_local_page(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
blocknr = le64_to_cpu(entry->de_blocknr);
if (blocknr == 0) {
@@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
*blocknrp = blocknr;
out:
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(entry_bh);
return ret;
}
@@ -457,10 +457,10 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
0, &entry_bh);
if (ret < 0)
return ret;
- kaddr = kmap_atomic(entry_bh->b_page);
+ kaddr = kmap_local_page(entry_bh->b_page);
/* last virtual block number in this block */
first = vinfo->vi_vblocknr;
- do_div(first, entries_per_block);
+ first = div64_ul(first, entries_per_block);
first *= entries_per_block;
last = first + entries_per_block - 1;
for (j = i, n = 0;
@@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
vinfo->vi_end = le64_to_cpu(entry->de_end);
vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(entry_bh);
}
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 4c85914f2abc..893ab36824cc 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -66,7 +66,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
dat = nilfs_bmap_get_dat(direct);
ret = nilfs_dat_translate(dat, ptr, &blocknr);
if (ret < 0)
- return ret;
+ goto dat_error;
ptr = blocknr;
}
@@ -79,7 +79,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
if (dat) {
ret = nilfs_dat_translate(dat, ptr2, &blocknr);
if (ret < 0)
- return ret;
+ goto dat_error;
ptr2 = blocknr;
}
if (ptr2 != ptr + cnt)
@@ -87,6 +87,11 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
}
*ptrp = ptr;
return cnt;
+
+ dat_error:
+ if (ret == -ENOENT)
+ ret = -EINVAL; /* Notify bmap layer of metadata corruption */
+ return ret;
}
static __u64
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index a8a4bc8490b4..612e609158b5 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -15,6 +15,7 @@
#include "mdt.h"
#include "alloc.h"
#include "ifile.h"
+#include "cpfile.h"
/**
* struct nilfs_ifile_info - on-memory private data of ifile
@@ -115,11 +116,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
return ret;
}
- kaddr = kmap_atomic(req.pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req.pr_entry_bh->b_page);
raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
req.pr_entry_bh, kaddr);
raw_inode->i_flags = 0;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(req.pr_entry_bh);
brelse(req.pr_entry_bh);
@@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile,
* nilfs_ifile_read - read or get ifile inode
* @sb: super block instance
* @root: root object
+ * @cno: number of checkpoint entry to read
* @inode_size: size of an inode
- * @raw_inode: on-disk ifile inode
- * @inodep: buffer to store the inode
+ *
+ * Return: 0 on success, or one of the following negative error codes on failure.
+ * * %-EINVAL - Invalid checkpoint.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
*/
int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
- size_t inode_size, struct nilfs_inode *raw_inode,
- struct inode **inodep)
+ __u64 cno, size_t inode_size)
{
+ struct the_nilfs *nilfs;
struct inode *ifile;
int err;
@@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
- err = nilfs_read_inode_common(ifile, raw_inode);
+ nilfs = sb->s_fs_info;
+ err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile);
if (err)
goto failed;
unlock_new_inode(ifile);
out:
- *inodep = ifile;
return 0;
failed:
iget_failed(ifile);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 35c5273f4821..625545cc2a98 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -21,15 +21,14 @@
static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
{
- void *kaddr = kmap(ibh->b_page);
+ void *kaddr = kmap_local_page(ibh->b_page);
return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
}
-static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
- struct buffer_head *ibh)
+static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode)
{
- kunmap(ibh->b_page);
+ kunmap_local(raw_inode);
}
int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
@@ -39,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
- size_t inode_size, struct nilfs_inode *raw_inode,
- struct inode **inodep);
+ __u64 cno, size_t inode_size);
#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 9c334c722fc1..7340a01d80e1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -112,7 +112,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
"%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
__func__, inode->i_ino,
(unsigned long long)blkoff);
- err = 0;
+ err = -EAGAIN;
}
nilfs_transaction_abort(inode->i_sb);
goto out;
@@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb,
inode, inode->i_mode,
huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
}
- nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+ nilfs_ifile_unmap_inode(raw_inode);
brelse(bh);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
@@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb,
return 0;
failed_unmap:
- nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+ nilfs_ifile_unmap_inode(raw_inode);
brelse(bh);
bad_inode:
@@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
return s_inode;
}
+/**
+ * nilfs_write_inode_common - export common inode information to on-disk inode
+ * @inode: inode object
+ * @raw_inode: on-disk inode
+ *
+ * This function writes standard information from the on-memory inode @inode
+ * to @raw_inode in the ifile, the cpfile, or a super root block. Since inode bmap
+ * data is not exported, nilfs_bmap_write() must be called separately during
+ * log writing.
+ */
void nilfs_write_inode_common(struct inode *inode,
- struct nilfs_inode *raw_inode, int has_bmap)
+ struct nilfs_inode *raw_inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
- struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
-
- /* zero-fill unused portion in the case of super root block */
- raw_inode->i_xattr = 0;
- raw_inode->i_pad = 0;
- memset((void *)raw_inode + sizeof(*raw_inode), 0,
- nilfs->ns_inode_size - sizeof(*raw_inode));
- }
-
- if (has_bmap)
- nilfs_bmap_write(ii->i_bmap, raw_inode);
- else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- raw_inode->i_device_code =
- cpu_to_le64(huge_encode_dev(inode->i_rdev));
/*
* When extending inode, nilfs->ns_inode_size should be checked
* for substitutions of appended fields.
@@ -813,14 +808,13 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
if (flags & I_DIRTY_DATASYNC)
set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
- nilfs_write_inode_common(inode, raw_inode, 0);
- /*
- * XXX: call with has_bmap = 0 is a workaround to avoid
- * deadlock of bmap. This delays update of i_bmap to just
- * before writing.
- */
+ nilfs_write_inode_common(inode, raw_inode);
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+ raw_inode->i_device_code =
+ cpu_to_le64(huge_encode_dev(inode->i_rdev));
- nilfs_ifile_unmap_inode(ifile, ino, ibh);
+ nilfs_ifile_unmap_inode(raw_inode);
}
#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index cfb6aca5ec38..f1a01c191cf5 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1111,7 +1111,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
minseg = range[0] + segbytes - 1;
- do_div(minseg, segbytes);
+ minseg = div64_ul(minseg, segbytes);
if (range[1] < 4096)
goto out;
@@ -1120,7 +1120,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
if (maxseg < segbytes)
goto out;
- do_div(maxseg, segbytes);
+ maxseg = div64_ul(maxseg, segbytes);
maxseg--;
ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index e45c01a559c0..4f792a0ad0f0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_mapped(bh);
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
if (init_block)
init_block(inode, bh, kaddr);
flush_dcache_page(bh->b_page);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 98cffaf0ac12..2e29b98ba8ba 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t);
extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern void nilfs_set_inode_flags(struct inode *);
extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
-extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+void nilfs_write_inode_common(struct inode *inode,
+ struct nilfs_inode *raw_inode);
struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5c2eba1987bd..14e470fb8870 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
struct page *spage = sbh->b_page, *dpage = dbh->b_page;
struct buffer_head *bh;
- kaddr0 = kmap_atomic(spage);
- kaddr1 = kmap_atomic(dpage);
+ kaddr0 = kmap_local_page(spage);
+ kaddr1 = kmap_local_page(dpage);
memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
- kunmap_atomic(kaddr1);
- kunmap_atomic(kaddr0);
+ kunmap_local(kaddr1);
+ kunmap_local(kaddr0);
dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
dbh->b_blocknr = sbh->b_blocknr;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index a9b8d77c8c1d..49a70c68bf3c 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -482,9 +482,9 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
if (unlikely(!bh_org))
return -EIO;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh_org);
return 0;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6e59dc19a732..dc431b4c34c9 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
crc = crc32_le(crc, bh->b_data, bh->b_size);
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
raw_sum->ss_datasum = cpu_to_le32(crc);
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2bfb08052d39..aa5290cb7467 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -880,76 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
nilfs_mdt_clear_dirty(nilfs->ns_dat);
}
-static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
-{
- struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
- struct buffer_head *bh_cp;
- struct nilfs_checkpoint *raw_cp;
- int err;
-
- /* XXX: this interface will be changed */
- err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
- &raw_cp, &bh_cp);
- if (likely(!err)) {
- /*
- * The following code is duplicated with cpfile. But, it is
- * needed to collect the checkpoint even if it was not newly
- * created.
- */
- mark_buffer_dirty(bh_cp);
- nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
- nilfs_cpfile_put_checkpoint(
- nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- } else if (err == -EINVAL || err == -ENOENT) {
- nilfs_error(sci->sc_super,
- "checkpoint creation failed due to metadata corruption.");
- err = -EIO;
- }
- return err;
-}
-
-static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
-{
- struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
- struct buffer_head *bh_cp;
- struct nilfs_checkpoint *raw_cp;
- int err;
-
- err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
- &raw_cp, &bh_cp);
- if (unlikely(err)) {
- if (err == -EINVAL || err == -ENOENT) {
- nilfs_error(sci->sc_super,
- "checkpoint finalization failed due to metadata corruption.");
- err = -EIO;
- }
- goto failed_ibh;
- }
- raw_cp->cp_snapshot_list.ssl_next = 0;
- raw_cp->cp_snapshot_list.ssl_prev = 0;
- raw_cp->cp_inodes_count =
- cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
- raw_cp->cp_blocks_count =
- cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
- raw_cp->cp_nblk_inc =
- cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
- raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
- raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
-
- if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
- nilfs_checkpoint_clear_minor(raw_cp);
- else
- nilfs_checkpoint_set_minor(raw_cp);
-
- nilfs_write_inode_common(sci->sc_root->ifile,
- &raw_cp->cp_ifile_inode, 1);
- nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- return 0;
-
- failed_ibh:
- return err;
-}
-
static void nilfs_fill_in_file_bmap(struct inode *ifile,
struct nilfs_inode_info *ii)
@@ -963,7 +893,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
ibh);
nilfs_bmap_write(ii->i_bmap, raw_inode);
- nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
+ nilfs_ifile_unmap_inode(raw_inode);
}
}
@@ -977,6 +907,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
}
}
+/**
+ * nilfs_write_root_mdt_inode - export root metadata inode information to
+ * the on-disk inode
+ * @inode: inode object of the root metadata file
+ * @raw_inode: on-disk inode
+ *
+ * nilfs_write_root_mdt_inode() writes inode information and bmap data of
+ * @inode to the inode area of the metadata file allocated on the super root
+ * block created to finalize the log. Since a super root block is laid out
+ * anew for each log, this function zero-fills the unused area of @raw_inode.
+ */
+static void nilfs_write_root_mdt_inode(struct inode *inode,
+ struct nilfs_inode *raw_inode)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+ nilfs_write_inode_common(inode, raw_inode);
+
+ /* zero-fill unused portion of raw_inode */
+ raw_inode->i_xattr = 0;
+ raw_inode->i_pad = 0;
+ memset((void *)raw_inode + sizeof(*raw_inode), 0,
+ nilfs->ns_inode_size - sizeof(*raw_inode));
+
+ nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode);
+}
+
static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
@@ -998,12 +955,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
raw_sr->sr_flags = 0;
- nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
- NILFS_SR_DAT_OFFSET(isz), 1);
- nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
- NILFS_SR_CPFILE_OFFSET(isz), 1);
- nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
- NILFS_SR_SUFILE_OFFSET(isz), 1);
+ nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr +
+ NILFS_SR_DAT_OFFSET(isz));
+ nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr +
+ NILFS_SR_CPFILE_OFFSET(isz));
+ nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr +
+ NILFS_SR_SUFILE_OFFSET(isz));
+
memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
set_buffer_uptodate(bh_sr);
unlock_buffer(bh_sr);
@@ -1230,7 +1188,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
break;
nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
- err = nilfs_segctor_create_checkpoint(sci);
+ err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile,
+ nilfs->ns_cno);
if (unlikely(err))
break;
fallthrough;
@@ -2101,7 +2060,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (mode == SC_LSEG_SR &&
nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
- err = nilfs_segctor_fill_in_checkpoint(sci);
+ err = nilfs_cpfile_finalize_checkpoint(
+ nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root,
+ sci->sc_nblk_inc + sci->sc_nblk_this_inc,
+ sci->sc_seg_ctime,
+ !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags));
if (unlikely(err))
goto failed_to_write;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 0a8119456c21..6748218be7c5 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -48,7 +48,7 @@ nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
- do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+ t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile));
return (unsigned long)t;
}
@@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
struct nilfs_sufile_header *header;
void *kaddr;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(header_bh);
}
@@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
ret = nilfs_sufile_get_header_block(sufile, &header_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
last_alloc = le64_to_cpu(header->sh_last_alloc);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nsegments = nilfs_sufile_get_nsegments(sufile);
maxsegnum = sui->allocmax;
@@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
&su_bh);
if (ret < 0)
goto out_header;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
@@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
continue;
/* found a clean segment */
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, -1);
le64_add_cpu(&header->sh_ndirtysegs, 1);
header->sh_last_alloc = cpu_to_le64(segnum);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
sui->ncleansegs--;
mark_buffer_dirty(header_bh);
@@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_header;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(su_bh);
}
@@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
struct nilfs_segment_usage *su;
void *kaddr;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
__func__, (unsigned long long)segnum);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_sufile_mod_counter(header_bh, -1, 1);
NILFS_SUI(sufile)->ncleansegs--;
@@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
void *kaddr;
int clean, dirty;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
su->su_nblocks == cpu_to_le32(0)) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
clean = nilfs_segment_usage_clean(su);
@@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
su->su_lastmod = cpu_to_le64(0);
su->su_nblocks = cpu_to_le32(0);
su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
NILFS_SUI(sufile)->ncleansegs -= clean;
@@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
void *kaddr;
int sudirty;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
__func__, (unsigned long long)segnum);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
if (unlikely(nilfs_segment_usage_error(su)))
@@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
(unsigned long long)segnum);
nilfs_segment_usage_set_clean(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(su_bh);
nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
@@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
if (ret)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
if (unlikely(nilfs_segment_usage_error(su))) {
struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
if (nilfs_segment_is_active(nilfs, segnum)) {
nilfs_error(sufile->i_sb,
@@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
ret = -EIO;
} else {
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
brelse(bh);
@@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
if (modtime) {
/*
@@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
su->su_lastmod = cpu_to_le64(modtime);
}
su->su_nblocks = cpu_to_le32(nblocks);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
@@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
@@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
spin_lock(&nilfs->ns_last_segment_lock);
sustat->ss_prot_seq = nilfs->ns_prot_seq;
spin_unlock(&nilfs->ns_last_segment_lock);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(header_bh);
out_sem:
@@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
void *kaddr;
int suclean;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_error(su)) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
suclean = nilfs_segment_usage_clean(su);
nilfs_segment_usage_set_error(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (suclean) {
nilfs_sufile_mod_counter(header_bh, -1, 0);
@@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
/* hole */
continue;
}
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
su2 = su;
@@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
nilfs_segment_is_active(nilfs, segnum + j)) {
ret = -EBUSY;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(su_bh);
goto out_header;
}
@@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
nc++;
}
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (nc > 0) {
mark_buffer_dirty(su_bh);
ncleaned += nc;
@@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
sui->allocmin = 0;
}
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(sufile);
@@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
continue;
}
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
for (j = 0; j < n;
@@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
si->sui_flags |=
BIT(NILFS_SEGMENT_USAGE_ACTIVE);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(su_bh);
}
ret = nsegs;
@@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
goto out_header;
for (;;) {
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, sup->sup_segnum, bh, kaddr);
@@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
sup = (void *)sup + supsz;
if (sup >= supend)
@@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
continue;
}
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
su_bh, kaddr);
for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
@@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
}
if (nblocks >= minlen) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
@@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
}
ndiscarded += nblocks;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
}
@@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
start = seg_start;
nblocks = seg_end - seg_start + 1;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_bh(su_bh);
}
@@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
goto failed;
sui = NILFS_SUI(sufile);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(header_bh);
sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
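Every sufile.c hunk above is the same mechanical conversion: kmap_atomic()/kunmap_atomic() become kmap_local_page()/kunmap_local(). Local mappings are still strictly nested and CPU-local, but unlike the atomic variant they do not disable preemption or pagefaults, so the mapped section may sleep. A minimal sketch of the pattern (illustrative helper, not from the patch), assuming a buffer_head that has already been read in:

	#include <linux/buffer_head.h>
	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Map the page backing bh, modify the bytes it covers, unmap.
	 * kmap_local mappings are released in reverse (stack) order and,
	 * unlike kmap_atomic(), permit sleeping while held. */
	static void example_touch_block(struct buffer_head *bh)
	{
		void *kaddr = kmap_local_page(bh->b_page);

		memset(kaddr + bh_offset(bh), 0, bh->b_size);
		kunmap_local(kaddr);
		mark_buffer_dirty(bh);
	}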
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index df8674173b22..ac24ed109ce9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -448,7 +448,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
newnsegs = sb2off >> nilfs->ns_blocksize_bits;
- do_div(newnsegs, nilfs->ns_blocks_per_segment);
+ newnsegs = div64_ul(newnsegs, nilfs->ns_blocks_per_segment);
ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
up_write(&nilfs->ns_segctor_sem);
@@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
{
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_root *root;
- struct nilfs_checkpoint *raw_cp;
- struct buffer_head *bh_cp;
int err = -ENOMEM;
root = nilfs_find_or_create_root(
@@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
goto reuse; /* already attached checkpoint */
down_read(&nilfs->ns_segctor_sem);
- err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
- &bh_cp);
+ err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size);
up_read(&nilfs->ns_segctor_sem);
- if (unlikely(err)) {
- if (err == -ENOENT || err == -EINVAL) {
- nilfs_err(sb,
- "Invalid checkpoint (checkpoint number=%llu)",
- (unsigned long long)cno);
- err = -EINVAL;
- }
+ if (unlikely(err))
goto failed;
- }
-
- err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
- &raw_cp->cp_ifile_inode, &root->ifile);
- if (err)
- goto failed_bh;
-
- atomic64_set(&root->inodes_count,
- le64_to_cpu(raw_cp->cp_inodes_count));
- atomic64_set(&root->blocks_count,
- le64_to_cpu(raw_cp->cp_blocks_count));
-
- nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
reuse:
*rootp = root;
return 0;
- failed_bh:
- nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
failed:
+ if (err == -EINVAL)
+ nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)",
+ (unsigned long long)cno);
nilfs_put_root(root);
return err;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 71400496ed36..2ae2c1bbf6d1 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -413,7 +413,7 @@ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
{
u64 max_count = U64_MAX;
- do_div(max_count, nilfs->ns_blocks_per_segment);
+ max_count = div64_ul(max_count, nilfs->ns_blocks_per_segment);
return min_t(u64, max_count, ULONG_MAX);
}
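Both do_div() conversions, here and in super.c above, are semantic no-ops: do_div(n, base) divides the 64-bit lvalue n in place and evaluates to the remainder, while div64_ul() takes the divisor as unsigned long and simply returns the quotient, which reads more naturally when the remainder is unused. A sketch of the equivalence (illustrative helpers):

	#include <linux/math64.h>
	#include <linux/types.h>

	/* Old style: do_div() updates its first argument in place. */
	static u64 segments_old(u64 nblocks, u32 blocks_per_segment)
	{
		do_div(nblocks, blocks_per_segment);	/* returns remainder */
		return nblocks;				/* now the quotient */
	}

	/* New style: div64_ul() returns the quotient directly. */
	static u64 segments_new(u64 nblocks, unsigned long blocks_per_segment)
	{
		return div64_ul(nblocks, blocks_per_segment);
	}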
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 1e4def21811e..224bccaab4cc 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -228,8 +228,10 @@ static int fanotify_get_response(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- ret = wait_event_killable(group->fanotify_data.access_waitq,
- event->state == FAN_EVENT_ANSWERED);
+ ret = wait_event_state(group->fanotify_data.access_waitq,
+ event->state == FAN_EVENT_ANSWERED,
+ (TASK_KILLABLE|TASK_FREEZABLE));
+
/* Signal pending? */
if (ret < 0) {
spin_lock(&group->notification_lock);
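wait_event_state() lets fanotify wait both killably and freezably: with TASK_FREEZABLE in the state mask, a task blocked on a permission-event answer counts as frozen during suspend instead of stalling the freezer. A sketch of the waiting pattern, with made-up names:

	#include <linux/sched.h>
	#include <linux/wait.h>

	/* Hypothetical waiter: returns 0 once *answered is set, or
	 * -ERESTARTSYS on a fatal signal; freezes transparently. */
	static int wait_for_answer(wait_queue_head_t *wq, bool *answered)
	{
		return wait_event_state(*wq, READ_ONCE(*answered),
					TASK_KILLABLE | TASK_FREEZABLE);
	}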
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8bfd690e9f10..2fc105a72a8f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -141,7 +141,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
}
/* Are inode/sb/mount interested in parent and name info with this event? */
-static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
+static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
__u32 mask)
{
__u32 marks_mask = 0;
@@ -160,13 +160,22 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
/* Did either inode/sb/mount subscribe for events with parent/name? */
marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask);
marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
- if (mnt)
- marks_mask |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask);
+ marks_mask |= fsnotify_parent_needed_mask(mnt_mask);
/* Did they subscribe for this event with parent/name info? */
return mask & marks_mask;
}
+/* Are there any inode/mount/sb objects that are interested in this event? */
+static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+ __u32 mask)
+{
+ __u32 marks_mask = inode->i_fsnotify_mask | mnt_mask |
+ inode->i_sb->s_fsnotify_mask;
+
+ return mask & marks_mask & ALL_FSNOTIFY_EVENTS;
+}
+
/*
* Notify this dentry's parent about a child's events with child name info
* if parent is watching or if inode/sb/mount are interested in events with
@@ -179,7 +188,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
int data_type)
{
const struct path *path = fsnotify_data_path(data, data_type);
- struct mount *mnt = path ? real_mount(path->mnt) : NULL;
+ __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0;
struct inode *inode = d_inode(dentry);
struct dentry *parent;
bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
@@ -190,16 +199,13 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
struct qstr *file_name = NULL;
int ret = 0;
- /*
- * Do inode/sb/mount care about parent and name info on non-dir?
- * Do they care about any event at all?
- */
- if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks &&
- (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched)
+ /* Optimize the likely case of nobody watching this path */
+ if (likely(!parent_watched &&
+ !fsnotify_object_watched(inode, mnt_mask, mask)))
return 0;
parent = NULL;
- parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
+ parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask);
if (!parent_watched && !parent_needed)
goto notify;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64a6ef638495..cb40cafbc062 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1615,7 +1615,7 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
- /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
spin_unlock_irqrestore(&lockres->l_lock, flags);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8b6d15010703..0da8e7bd3261 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2763,6 +2763,7 @@ const struct inode_operations ocfs2_file_iops = {
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
+ .listxattr = ocfs2_listxattr,
.permission = ocfs2_permission,
.get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 82b28fdacc7e..accf03d4765e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -65,7 +65,7 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
};
/*
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index dc9f76ab7e13..0575c2d060eb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -447,14 +447,17 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
int err;
struct quota_info *dqopt = sb_dqopt(sb);
struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv;
+ unsigned int memalloc;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
err = ocfs2_qinfo_lock(info, 1);
if (err < 0)
goto out_sem;
err = __ocfs2_global_write_info(sb, type);
ocfs2_qinfo_unlock(info, 1);
out_sem:
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
return err;
}
@@ -601,6 +604,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(sb);
int status = 0;
+ unsigned int memalloc;
trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type,
@@ -618,6 +622,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
goto out_ilock;
}
down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_sync_dquot(dquot);
if (status < 0)
mlog_errno(status);
@@ -625,6 +630,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
@@ -662,6 +668,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
+ unsigned int memalloc;
trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type);
@@ -673,7 +680,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
goto out;
}
down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_local_write_dquot(dquot);
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out:
@@ -920,6 +929,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ unsigned int memalloc;
trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
type);
@@ -946,6 +956,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
goto out_ilock;
}
down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
@@ -954,6 +965,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
/* Now write updated local dquot structure */
status = ocfs2_local_write_dquot(dquot);
out_dlock:
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
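Each of these ocfs2 hunks wraps the section holding dqio_sem in a memalloc_nofs scope, so every allocation made while quota I/O is in flight is implicitly GFP_NOFS and cannot re-enter filesystem reclaim and deadlock on the same semaphore. The same scoping is what lets quota_tree.c further down switch its kmalloc() calls from GFP_NOFS to GFP_KERNEL: callers, not call sites, now declare the NOFS constraint. The pattern in isolation (illustrative function):

	#include <linux/rwsem.h>
	#include <linux/sched/mm.h>

	/* All allocations between save and restore behave as GFP_NOFS. */
	static int quota_io_locked(struct rw_semaphore *dqio_sem)
	{
		unsigned int memalloc;
		int err = 0;

		down_write(dqio_sem);
		memalloc = memalloc_nofs_save();
		/* ... read/write quota data; may allocate ... */
		memalloc_nofs_restore(memalloc);
		up_write(dqio_sem);
		return err;
	}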
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index e09842fc9d4d..8ce462c64c51 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -470,6 +470,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int bit, chunk;
struct ocfs2_recovery_chunk *rchunk, *next;
qsize_t spacechange, inodechange;
+ unsigned int memalloc;
trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
@@ -521,6 +522,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_drop_lock;
}
down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
spin_lock(&dquot->dq_dqb_lock);
/* Add usage from quota entry into quota changes
* of our node. Auxiliary variables are important
@@ -553,6 +555,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
out_commit:
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_drop_lock:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b3f860888e93..8aabaed2c1cb 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -122,7 +122,7 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
static int ocfs2_enable_quotas(struct ocfs2_super *osb);
static void ocfs2_disable_quotas(struct ocfs2_super *osb);
-static struct dquot **ocfs2_get_dquots(struct inode *inode)
+static struct dquot __rcu **ocfs2_get_dquots(struct inode *inode)
{
return OCFS2_I(inode)->i_dquot;
}
@@ -1711,12 +1711,12 @@ static int ocfs2_initialize_mem_caches(void)
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
0,
- (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT),
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
NULL);
ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
sizeof(struct ocfs2_quota_chunk),
0,
- (SLAB_RECLAIM_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
!ocfs2_qf_chunk_cachep) {
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 3b6982bf6bcf..e75e173a9186 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -22,7 +22,7 @@ int op_cache_initialize(void)
op_cache = kmem_cache_create("orangefs_op_cache",
sizeof(struct orangefs_kernel_op_s),
0,
- ORANGEFS_CACHE_CREATE_FLAGS,
+ 0,
NULL);
if (!op_cache) {
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 926d9c0a428a..e2df7eeadc7a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -93,16 +93,6 @@ enum orangefs_vfs_op_states {
OP_VFS_STATE_GIVEN_UP = 16,
};
-/*
- * orangefs kernel memory related flags
- */
-
-#if (defined CONFIG_DEBUG_SLAB)
-#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
-#else
-#define ORANGEFS_CACHE_CREATE_FLAGS 0
-#endif
-
extern const struct xattr_handler * const orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 5254256a224d..34849b4a3243 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -527,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
if (!ORANGEFS_SB(sb)) {
d = ERR_PTR(-ENOMEM);
- goto free_sb_and_op;
+ goto free_op;
}
ret = orangefs_fill_sb(sb,
@@ -644,7 +644,7 @@ int orangefs_inode_cache_initialize(void)
"orangefs_inode_cache",
sizeof(struct orangefs_inode_s),
0,
- ORANGEFS_CACHE_CREATE_FLAGS,
+ 0,
offsetof(struct orangefs_inode_s,
link_target),
sizeof_field(struct orangefs_inode_s,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8586e2f5d243..0762575a1e70 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -234,11 +234,11 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
{
loff_t tmp;
- if (WARN_ON_ONCE(pos != pos2))
+ if (pos != pos2)
return -EIO;
- if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0))
+ if (pos < 0 || len < 0 || totlen < 0)
return -EIO;
- if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp)))
+ if (check_add_overflow(pos, len, &tmp))
return -EIO;
return 0;
}
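Dropping the WARN_ON_ONCE() wrappers means a malformed range coming from a possibly crafted lower filesystem now yields a quiet -EIO instead of a kernel warning; the checks themselves are unchanged. check_add_overflow() evaluates to true when the typed addition wraps, storing the wrapped sum in its third argument. In isolation:

	#include <linux/errno.h>
	#include <linux/overflow.h>
	#include <linux/types.h>

	/* Reject a range whose end offset would overflow loff_t. */
	static int range_valid(loff_t pos, loff_t len)
	{
		loff_t end;

		if (pos < 0 || len < 0)
			return -EIO;
		if (check_add_overflow(pos, len, &end))
			return -EIO;	/* pos + len wrapped around */
		return 0;
	}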
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 32b1116ae137..d80a1431ef7b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -32,7 +32,7 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
- select CRASH_CORE
+ select VMCORE_INFO
help
Provides a virtual ELF core file of the live kernel. This can
be read with gdb and other ELF tools. No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b080..8e08a9a1b7ed 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,7 +10,7 @@
* Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
*/
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3f78ebbb795f..23fbab954c20 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
- struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
@@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
hole_end = end;
for (; addr < hole_end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_SOFTDIRTY)
pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
if (pm->show_pfn) {
@@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pagemap_entry_t pme;
pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
}
@@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
return err;
if (pm->show_pfn && (flags & PM_PRESENT))
@@ -1807,7 +1806,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pte_to_swp_entry(pte);
if (is_pfn_swap_entry(swp) &&
- !PageAnon(pfn_swap_entry_to_page(swp)))
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
categories |= PAGE_IS_FILE;
}
if (pte_swp_soft_dirty(pte))
@@ -1873,7 +1872,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pmd_to_swp_entry(pmd);
if (is_pfn_swap_entry(swp) &&
- !PageAnon(pfn_swap_entry_to_page(swp)))
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
categories |= PAGE_IS_FILE;
}
}
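The two pagemap hunks that touch swap entries are folio conversions: pfn_swap_entry_folio() yields the folio for a PFN-style swap entry directly, and folio_test_anon() replaces PageAnon(), skipping a per-call compound_head() lookup. The resulting test, sketched as a standalone helper:

	#include <linux/mm.h>
	#include <linux/swapops.h>

	/* True if the swap entry refers to a file-backed (non-anonymous)
	 * folio; mirrors the PAGE_IS_FILE classification above. */
	static bool swap_entry_is_file_backed(swp_entry_t swp)
	{
		return is_pfn_swap_entry(swp) &&
		       !folio_test_anon(pfn_swap_entry_folio(swp));
	}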
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index eb6e9d95dea1..dacbee455c03 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -399,15 +399,17 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
/* Dirtify all the dquots - this can block when journalling */
-static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
+static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
int ret, err, cnt;
+ struct dquot *dquot;
ret = err = 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquot[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot)
/* Even in case of error we have to continue */
- ret = mark_dquot_dirty(dquot[cnt]);
+ ret = mark_dquot_dirty(dquot);
if (!err)
err = ret;
}
@@ -875,10 +877,7 @@ void dqput(struct dquot *dquot)
}
/* Need to release dquot? */
-#ifdef CONFIG_QUOTA_DEBUG
- /* sanity check */
- BUG_ON(!list_empty(&dquot->dq_free));
-#endif
+ WARN_ON_ONCE(!list_empty(&dquot->dq_free));
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
@@ -987,9 +986,8 @@ we_slept:
* smp_mb__before_atomic() in dquot_acquire().
*/
smp_rmb();
-#ifdef CONFIG_QUOTA_DEBUG
- BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
-#endif
+ /* Has somebody invalidated entry under us? */
+ WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash));
out:
if (empty)
do_destroy_dquot(empty);
@@ -998,14 +996,14 @@ out:
}
EXPORT_SYMBOL(dqget);
-static inline struct dquot **i_dquot(struct inode *inode)
+static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
return inode->i_sb->s_op->get_dquots(inode);
}
static int dqinit_needed(struct inode *inode, int type)
{
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1095,14 +1093,16 @@ static void remove_dquot_ref(struct super_block *sb, int type)
*/
spin_lock(&dq_data_lock);
if (!IS_NOQUOTA(inode)) {
- struct dquot **dquots = i_dquot(inode);
- struct dquot *dquot = dquots[type];
+ struct dquot __rcu **dquots = i_dquot(inode);
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[type], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
#endif
- dquots[type] = NULL;
+ rcu_assign_pointer(dquots[type], NULL);
if (dquot)
dqput(dquot);
}
@@ -1455,7 +1455,8 @@ static int inode_quota_active(const struct inode *inode)
static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot **dquots, *got[MAXQUOTAS] = {};
+ struct dquot __rcu **dquots;
+ struct dquot *got[MAXQUOTAS] = {};
struct super_block *sb = inode->i_sb;
qsize_t rsv;
int ret = 0;
@@ -1530,7 +1531,7 @@ static int __dquot_initialize(struct inode *inode, int type)
if (!got[cnt])
continue;
if (!dquots[cnt]) {
- dquots[cnt] = got[cnt];
+ rcu_assign_pointer(dquots[cnt], got[cnt]);
got[cnt] = NULL;
/*
* Make quota reservation system happy if someone
@@ -1538,12 +1539,16 @@ static int __dquot_initialize(struct inode *inode, int type)
*/
rsv = inode_get_rsv_space(inode);
if (unlikely(rsv)) {
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[cnt], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+
spin_lock(&inode->i_lock);
/* Get reservation again under proper lock */
rsv = __inode_get_rsv_space(inode);
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- dquots[cnt]->dq_dqb.dqb_rsvspace += rsv;
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot->dq_dqb.dqb_rsvspace += rsv;
+ spin_unlock(&dquot->dq_dqb_lock);
spin_unlock(&inode->i_lock);
}
}
@@ -1565,7 +1570,7 @@ EXPORT_SYMBOL(dquot_initialize);
bool dquot_initialize_needed(struct inode *inode)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
int i;
if (!inode_quota_active(inode))
@@ -1590,13 +1595,14 @@ EXPORT_SYMBOL(dquot_initialize_needed);
static void __dquot_drop(struct inode *inode)
{
int cnt;
- struct dquot **dquots = i_dquot(inode);
+ struct dquot __rcu **dquots = i_dquot(inode);
struct dquot *put[MAXQUOTAS];
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- put[cnt] = dquots[cnt];
- dquots[cnt] = NULL;
+ put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+ rcu_assign_pointer(dquots[cnt], NULL);
}
spin_unlock(&dq_data_lock);
dqput_all(put);
@@ -1604,7 +1610,7 @@ static void __dquot_drop(struct inode *inode)
void dquot_drop(struct inode *inode)
{
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1677,7 +1683,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
int reserve = flags & DQUOT_SPACE_RESERVE;
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
if (!inode_quota_active(inode)) {
if (reserve) {
@@ -1697,27 +1704,26 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
index = srcu_read_lock(&dquot_srcu);
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
if (reserve) {
- ret = dquot_add_space(dquots[cnt], 0, number, flags,
- &warn[cnt]);
+ ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
} else {
- ret = dquot_add_space(dquots[cnt], number, 0, flags,
- &warn[cnt]);
+ ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
}
if (ret) {
/* Back out changes we already did */
for (cnt--; cnt >= 0; cnt--) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
if (reserve)
- dquot_free_reserved_space(dquots[cnt],
- number);
+ dquot_free_reserved_space(dquot, number);
else
- dquot_decr_space(dquots[cnt], number);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ dquot_decr_space(dquot, number);
+ spin_unlock(&dquot->dq_dqb_lock);
}
spin_unlock(&inode->i_lock);
goto out_flush_warn;
@@ -1747,7 +1753,8 @@ int dquot_alloc_inode(struct inode *inode)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
+ struct dquot *dquot;
if (!inode_quota_active(inode))
return 0;
@@ -1758,17 +1765,19 @@ int dquot_alloc_inode(struct inode *inode)
index = srcu_read_lock(&dquot_srcu);
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]);
+ ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
if (ret) {
for (cnt--; cnt >= 0; cnt--) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
/* Back out changes we already did */
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- dquot_decr_inodes(dquots[cnt], 1);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot_decr_inodes(dquot, 1);
+ spin_unlock(&dquot->dq_dqb_lock);
}
goto warn_put_all;
}
@@ -1789,7 +1798,8 @@ EXPORT_SYMBOL(dquot_alloc_inode);
*/
void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int cnt, index;
if (!inode_quota_active(inode)) {
@@ -1805,9 +1815,8 @@ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt]) {
- struct dquot *dquot = dquots[cnt];
-
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot) {
spin_lock(&dquot->dq_dqb_lock);
if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
number = dquot->dq_dqb.dqb_rsvspace;
@@ -1831,7 +1840,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
*/
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int cnt, index;
if (!inode_quota_active(inode)) {
@@ -1847,9 +1857,8 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt]) {
- struct dquot *dquot = dquots[cnt];
-
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot) {
spin_lock(&dquot->dq_dqb_lock);
if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
number = dquot->dq_dqb.dqb_curspace;
@@ -1875,7 +1884,8 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int reserve = flags & DQUOT_SPACE_RESERVE, index;
if (!inode_quota_active(inode)) {
@@ -1896,17 +1906,18 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
int wtype;
warn[cnt].w_type = QUOTA_NL_NOWARN;
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- wtype = info_bdq_free(dquots[cnt], number);
+ spin_lock(&dquot->dq_dqb_lock);
+ wtype = info_bdq_free(dquot, number);
if (wtype != QUOTA_NL_NOWARN)
- prepare_warning(&warn[cnt], dquots[cnt], wtype);
+ prepare_warning(&warn[cnt], dquot, wtype);
if (reserve)
- dquot_free_reserved_space(dquots[cnt], number);
+ dquot_free_reserved_space(dquot, number);
else
- dquot_decr_space(dquots[cnt], number);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ dquot_decr_space(dquot, number);
+ spin_unlock(&dquot->dq_dqb_lock);
}
if (reserve)
*inode_reserved_space(inode) -= number;
@@ -1930,7 +1941,8 @@ void dquot_free_inode(struct inode *inode)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
+ struct dquot *dquot;
int index;
if (!inode_quota_active(inode))
@@ -1941,16 +1953,16 @@ void dquot_free_inode(struct inode *inode)
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
-
warn[cnt].w_type = QUOTA_NL_NOWARN;
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- wtype = info_idq_free(dquots[cnt], 1);
+ spin_lock(&dquot->dq_dqb_lock);
+ wtype = info_idq_free(dquot, 1);
if (wtype != QUOTA_NL_NOWARN)
- prepare_warning(&warn[cnt], dquots[cnt], wtype);
- dquot_decr_inodes(dquots[cnt], 1);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ prepare_warning(&warn[cnt], dquot, wtype);
+ dquot_decr_inodes(dquot, 1);
+ spin_unlock(&dquot->dq_dqb_lock);
}
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
@@ -1976,8 +1988,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
qsize_t cur_space;
qsize_t rsv_space = 0;
qsize_t inode_usage = 1;
+ struct dquot __rcu **dquots;
struct dquot *transfer_from[MAXQUOTAS] = {};
- int cnt, ret = 0;
+ int cnt, index, ret = 0;
char is_valid[MAXQUOTAS] = {};
struct dquot_warn warn_to[MAXQUOTAS];
struct dquot_warn warn_from_inodes[MAXQUOTAS];
@@ -2008,6 +2021,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
}
cur_space = __inode_get_bytes(inode);
rsv_space = __inode_get_rsv_space(inode);
+ dquots = i_dquot(inode);
/*
* Build the transfer_from list, check limits, and update usage in
* the target structures.
@@ -2022,7 +2036,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
if (!sb_has_quota_active(inode->i_sb, cnt))
continue;
is_valid[cnt] = 1;
- transfer_from[cnt] = i_dquot(inode)[cnt];
+ transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
+ &dquot_srcu, lockdep_is_held(&dq_data_lock));
ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
&warn_to[cnt]);
if (ret)
@@ -2061,13 +2076,21 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
rsv_space);
spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
}
- i_dquot(inode)[cnt] = transfer_to[cnt];
+ rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
}
spin_unlock(&inode->i_lock);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(transfer_from);
- mark_all_dquot_dirty(transfer_to);
+ /*
+ * These arrays are local and we hold dquot references so we don't need
+ * the srcu protection but still take dquot_srcu to avoid warning in
+ * mark_all_dquot_dirty().
+ */
+ index = srcu_read_lock(&dquot_srcu);
+ mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
+ mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ srcu_read_unlock(&dquot_srcu, index);
+
flush_warnings(warn_to);
flush_warnings(warn_from_inodes);
flush_warnings(warn_from_space);
@@ -2388,7 +2411,8 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
lockdep_assert_held_write(&sb->s_umount);
/* Just unsuspend quotas? */
- BUG_ON(flags & DQUOT_SUSPENDED);
+ if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED))
+ return -EINVAL;
if (!fmt)
return -ESRCH;
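The common thread in the dquot.c changes: inode dquot pointers are now annotated __rcu and only published with rcu_assign_pointer() under dq_data_lock, while readers fetch them with srcu_dereference() inside an srcu_read_lock(&dquot_srcu) section, so a dquot cannot be freed while a lockless reader still holds it. The reader shape, sketched as if it lived in fs/quota/dquot.c (dquot_srcu is file-local there):

	#include <linux/quota.h>
	#include <linux/srcu.h>

	/* Walk an inode's dquot array safely against a concurrent
	 * dquot_drop(); each dquot stays valid until the matching
	 * srcu_read_unlock(). */
	static void for_each_inode_dquot(struct dquot __rcu **dquots)
	{
		struct dquot *dquot;
		int cnt, idx;

		idx = srcu_read_lock(&dquot_srcu);
		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
			dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
			if (!dquot)
				continue;
			/* ... use dquot ... */
		}
		srcu_read_unlock(&dquot_srcu, idx);
	}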
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 0f1493e0f6d0..afceef3ddfaa 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -21,6 +21,12 @@ MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Quota trie support");
MODULE_LICENSE("GPL");
+/*
+ * Maximum quota tree depth we support. Only to limit recursion when working
+ * with the tree.
+ */
+#define MAX_QTREE_DEPTH 6
+
#define __QUOTA_QT_PARANOIA
static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
@@ -108,7 +114,7 @@ static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
int ret, blk;
@@ -160,7 +166,7 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
uint blk)
{
- char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
uint nextblk = le32_to_cpu(dh->dqdh_next_free);
uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -207,7 +213,7 @@ out_buf:
static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
uint blk)
{
- char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
int err;
@@ -255,7 +261,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
{
uint blk, i;
struct qt_disk_dqdbheader *dh;
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
char *ddquot;
*err = 0;
@@ -327,27 +333,36 @@ out_buf:
/* Insert reference to structure into the trie */
static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
- uint *treeblk, int depth)
+ uint *blks, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
int ret = 0, newson = 0, newact = 0;
__le32 *ref;
uint newblk;
+ int i;
if (!buf)
return -ENOMEM;
- if (!*treeblk) {
+ if (!blks[depth]) {
ret = get_free_dqblk(info);
if (ret < 0)
goto out_buf;
- *treeblk = ret;
+ for (i = 0; i < depth; i++)
+ if (ret == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Free block already used in tree: block %u",
+ ret);
+ ret = -EIO;
+ goto out_buf;
+ }
+ blks[depth] = ret;
memset(buf, 0, info->dqi_usable_bs);
newact = 1;
} else {
- ret = read_blk(info, *treeblk, buf);
+ ret = read_blk(info, blks[depth], buf);
if (ret < 0) {
quota_error(dquot->dq_sb, "Can't read tree quota "
- "block %u", *treeblk);
+ "block %u", blks[depth]);
goto out_buf;
}
}
@@ -357,8 +372,20 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
info->dqi_blocks - 1);
if (ret)
goto out_buf;
- if (!newblk)
+ if (!newblk) {
newson = 1;
+ } else {
+ for (i = 0; i <= depth; i++)
+ if (newblk == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Cycle in quota tree detected: block %u index %u",
+ blks[depth],
+ get_index(info, dquot->dq_id, depth));
+ ret = -EIO;
+ goto out_buf;
+ }
+ }
+ blks[depth + 1] = newblk;
if (depth == info->dqi_qtree_depth - 1) {
#ifdef __QUOTA_QT_PARANOIA
if (newblk) {
@@ -370,16 +397,16 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
#endif
- newblk = find_free_dqentry(info, dquot, &ret);
+ blks[depth + 1] = find_free_dqentry(info, dquot, &ret);
} else {
- ret = do_insert_tree(info, dquot, &newblk, depth+1);
+ ret = do_insert_tree(info, dquot, blks, depth + 1);
}
if (newson && ret >= 0) {
ref[get_index(info, dquot->dq_id, depth)] =
- cpu_to_le32(newblk);
- ret = write_blk(info, *treeblk, buf);
+ cpu_to_le32(blks[depth + 1]);
+ ret = write_blk(info, blks[depth], buf);
} else if (newact && ret < 0) {
- put_free_dqblk(info, buf, *treeblk);
+ put_free_dqblk(info, buf, blks[depth]);
}
out_buf:
kfree(buf);
@@ -390,7 +417,7 @@ out_buf:
static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
struct dquot *dquot)
{
- int tmp = QT_TREEOFF;
+ uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
#ifdef __QUOTA_QT_PARANOIA
if (info->dqi_blocks <= QT_TREEOFF) {
@@ -398,7 +425,11 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
return -EIO;
}
#endif
- return do_insert_tree(info, dquot, &tmp, 0);
+ if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+ quota_error(dquot->dq_sb, "Quota tree depth too big!");
+ return -EIO;
+ }
+ return do_insert_tree(info, dquot, blks, 0);
}
/*
@@ -410,7 +441,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
int type = dquot->dq_id.type;
struct super_block *sb = dquot->dq_sb;
ssize_t ret;
- char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
+ char *ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
if (!ddquot)
return -ENOMEM;
@@ -449,7 +480,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
uint blk)
{
struct qt_disk_dqdbheader *dh;
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
int ret = 0;
if (!buf)
@@ -511,19 +542,20 @@ out_buf:
/* Remove reference to dquot from tree */
static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
- uint *blk, int depth)
+ uint *blks, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
int ret = 0;
uint newblk;
__le32 *ref = (__le32 *)buf;
+ int i;
if (!buf)
return -ENOMEM;
- ret = read_blk(info, *blk, buf);
+ ret = read_blk(info, blks[depth], buf);
if (ret < 0) {
quota_error(dquot->dq_sb, "Can't read quota data block %u",
- *blk);
+ blks[depth]);
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -532,29 +564,38 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
if (ret)
goto out_buf;
+ for (i = 0; i <= depth; i++)
+ if (newblk == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Cycle in quota tree detected: block %u index %u",
+ blks[depth],
+ get_index(info, dquot->dq_id, depth));
+ ret = -EIO;
+ goto out_buf;
+ }
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
- newblk = 0;
+ blks[depth + 1] = 0;
} else {
- ret = remove_tree(info, dquot, &newblk, depth+1);
+ blks[depth + 1] = newblk;
+ ret = remove_tree(info, dquot, blks, depth + 1);
}
- if (ret >= 0 && !newblk) {
- int i;
+ if (ret >= 0 && !blks[depth + 1]) {
ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
/* Block got empty? */
for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
;
/* Don't put the root block into the free block list */
if (i == (info->dqi_usable_bs >> 2)
- && *blk != QT_TREEOFF) {
- put_free_dqblk(info, buf, *blk);
- *blk = 0;
+ && blks[depth] != QT_TREEOFF) {
+ put_free_dqblk(info, buf, blks[depth]);
+ blks[depth] = 0;
} else {
- ret = write_blk(info, *blk, buf);
+ ret = write_blk(info, blks[depth], buf);
if (ret < 0)
quota_error(dquot->dq_sb,
"Can't write quota tree block %u",
- *blk);
+ blks[depth]);
}
}
out_buf:
@@ -565,11 +606,15 @@ out_buf:
/* Delete dquot from tree */
int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
- uint tmp = QT_TREEOFF;
+ uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
if (!dquot->dq_off) /* Even not allocated? */
return 0;
- return remove_tree(info, dquot, &tmp, 0);
+ if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+ quota_error(dquot->dq_sb, "Quota tree depth too big!");
+ return -EIO;
+ }
+ return remove_tree(info, dquot, blks, 0);
}
EXPORT_SYMBOL(qtree_delete_dquot);
@@ -577,7 +622,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
struct dquot *dquot, uint blk)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
loff_t ret = 0;
int i;
char *ddquot;
@@ -613,18 +658,20 @@ out_buf:
/* Find entry for given id in the tree */
static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
- struct dquot *dquot, uint blk, int depth)
+ struct dquot *dquot, uint *blks, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
loff_t ret = 0;
__le32 *ref = (__le32 *)buf;
+ uint blk;
+ int i;
if (!buf)
return -ENOMEM;
- ret = read_blk(info, blk, buf);
+ ret = read_blk(info, blks[depth], buf);
if (ret < 0) {
quota_error(dquot->dq_sb, "Can't read quota tree block %u",
- blk);
+ blks[depth]);
goto out_buf;
}
ret = 0;
@@ -636,8 +683,19 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
if (ret)
goto out_buf;
+ /* Check for cycles in the tree */
+ for (i = 0; i <= depth; i++)
+ if (blk == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Cycle in quota tree detected: block %u index %u",
+ blks[depth],
+ get_index(info, dquot->dq_id, depth));
+ ret = -EIO;
+ goto out_buf;
+ }
+ blks[depth + 1] = blk;
if (depth < info->dqi_qtree_depth - 1)
- ret = find_tree_dqentry(info, dquot, blk, depth+1);
+ ret = find_tree_dqentry(info, dquot, blks, depth + 1);
else
ret = find_block_dqentry(info, dquot, blk);
out_buf:
@@ -649,7 +707,13 @@ out_buf:
static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
struct dquot *dquot)
{
- return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+ uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
+
+ if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+ quota_error(dquot->dq_sb, "Quota tree depth too big!");
+ return -EIO;
+ }
+ return find_tree_dqentry(info, dquot, blks, 0);
}
int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
@@ -684,7 +748,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
}
dquot->dq_off = offset;
}
- ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
+ ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
if (!ddquot)
return -ENOMEM;
ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
@@ -728,7 +792,7 @@ EXPORT_SYMBOL(qtree_release_dquot);
static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
unsigned int blk, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
__le32 *ref = (__le32 *)buf;
ssize_t ret;
unsigned int epb = info->dqi_usable_bs >> 2;
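do_insert_tree(), remove_tree() and find_tree_dqentry() now thread the whole path of visited block numbers through a blks[] array (bounded by MAX_QTREE_DEPTH) instead of a single current block, so each descent can compare the next child against every ancestor and fail with -EIO on a corrupted tree that loops, rather than recursing forever. The ancestor check each walker repeats, factored out as a sketch:

	#include <linux/errno.h>
	#include <linux/types.h>

	/* blks[0..depth] is the path walked so far; a child block that
	 * matches any ancestor means the on-disk tree contains a cycle. */
	static int qtree_check_cycle(const uint *blks, int depth, uint newblk)
	{
		int i;

		for (i = 0; i <= depth; i++)
			if (newblk == blks[i])
				return -EIO;
		return 0;
	}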
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index a0db3f195e95..3f3e8acc05db 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -160,9 +160,11 @@ static int v1_read_file_info(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
struct v1_disk_dqblk dqblk;
+ unsigned int memalloc;
int ret;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -179,6 +181,7 @@ static int v1_read_file_info(struct super_block *sb, int type)
dqopt->info[type].dqi_bgrace =
dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
out:
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
@@ -187,9 +190,11 @@ static int v1_write_file_info(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
struct v1_disk_dqblk dqblk;
+ unsigned int memalloc;
int ret;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -209,6 +214,7 @@ static int v1_write_file_info(struct super_block *sb, int type)
else if (ret >= 0)
ret = -EIO;
out:
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
return ret;
}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ae99e7b88205..c48c233f3bef 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -96,9 +96,11 @@ static int v2_read_file_info(struct super_block *sb, int type)
struct qtree_mem_dqinfo *qinfo;
ssize_t size;
unsigned int version;
+ unsigned int memalloc;
int ret;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = v2_read_header(sb, type, &dqhead);
if (ret < 0)
goto out;
@@ -119,7 +121,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
ret = -EIO;
goto out;
}
- info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+ info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_KERNEL);
if (!info->dqi_priv) {
ret = -ENOMEM;
goto out;
@@ -166,14 +168,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
i_size_read(sb_dqopt(sb)->files[type]));
goto out_free;
}
- if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
- quota_error(sb, "Free block number too big (%u >= %u).",
- qinfo->dqi_free_blk, qinfo->dqi_blocks);
+ if (qinfo->dqi_free_blk && (qinfo->dqi_free_blk <= QT_TREEOFF ||
+ qinfo->dqi_free_blk >= qinfo->dqi_blocks)) {
+ quota_error(sb, "Free block number %u out of range (%u, %u).",
+ qinfo->dqi_free_blk, QT_TREEOFF, qinfo->dqi_blocks);
goto out_free;
}
- if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
- quota_error(sb, "Block with free entry too big (%u >= %u).",
- qinfo->dqi_free_entry, qinfo->dqi_blocks);
+ if (qinfo->dqi_free_entry && (qinfo->dqi_free_entry <= QT_TREEOFF ||
+ qinfo->dqi_free_entry >= qinfo->dqi_blocks)) {
+ quota_error(sb, "Block with free entry %u out of range (%u, %u).",
+ qinfo->dqi_free_entry, QT_TREEOFF,
+ qinfo->dqi_blocks);
goto out_free;
}
ret = 0;
@@ -183,6 +188,7 @@ out_free:
info->dqi_priv = NULL;
}
out:
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
@@ -195,8 +201,10 @@ static int v2_write_file_info(struct super_block *sb, int type)
struct mem_dqinfo *info = &dqopt->info[type];
struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
ssize_t size;
+ unsigned int memalloc;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
spin_lock(&dq_data_lock);
info->dqi_flags &= ~DQF_INFO_DIRTY;
dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
@@ -209,6 +217,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
if (size != sizeof(struct v2_disk_dqinfo)) {
quota_error(sb, "Can't write info structure");
@@ -328,11 +337,14 @@ static int v2_read_dquot(struct dquot *dquot)
{
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
int ret;
+ unsigned int memalloc;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = qtree_read_dquot(
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
dquot);
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
@@ -342,6 +354,7 @@ static int v2_write_dquot(struct dquot *dquot)
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
int ret;
bool alloc = false;
+ unsigned int memalloc;
/*
* If space for dquot is already allocated, we don't need any
@@ -355,9 +368,11 @@ static int v2_write_dquot(struct dquot *dquot)
} else {
down_read(&dqopt->dqio_sem);
}
+ memalloc = memalloc_nofs_save();
ret = qtree_write_dquot(
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
dquot);
+ memalloc_nofs_restore(memalloc);
if (alloc)
up_write(&dqopt->dqio_sem);
else
@@ -368,10 +383,13 @@ static int v2_write_dquot(struct dquot *dquot)
static int v2_release_dquot(struct dquot *dquot)
{
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ unsigned int memalloc;
int ret;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
return ret;
@@ -386,10 +404,13 @@ static int v2_free_file_info(struct super_block *sb, int type)
static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
{
struct quota_info *dqopt = sb_dqopt(sb);
+ unsigned int memalloc;
int ret;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
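The v2 info checks are tightened from "below dqi_blocks" to a full range test: a free-list head is either unset (0) or must lie strictly between QT_TREEOFF and dqi_blocks, since blocks at or below the tree offset can never be legitimate free-list entries. As a predicate (illustrative helper):

	#include <linux/types.h>

	/* Valid free-list block: zero (empty list) or strictly inside the
	 * data area (treeoff, nr_blocks) of the quota file. */
	static bool qtree_free_blk_valid(uint blk, uint treeoff, uint nr_blocks)
	{
		return blk == 0 || (blk > treeoff && blk < nr_blocks);
	}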
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 0554903f42a9..f0e1f29f20ee 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -97,7 +97,7 @@ struct reiserfs_inode_info {
struct rw_semaphore i_xattr_sem;
#endif
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
struct inode vfs_inode;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2cc469d481a2..ab76468da02d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -801,7 +801,7 @@ static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
loff_t);
-static struct dquot **reiserfs_get_dquots(struct inode *inode)
+static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
{
return REISERFS_I(inode)->i_dquot;
}
diff --git a/fs/smb/server/glob.h b/fs/smb/server/glob.h
index 5b8f3e0ebdb3..d528b20b37a8 100644
--- a/fs/smb/server/glob.h
+++ b/fs/smb/server/glob.h
@@ -12,8 +12,6 @@
#include "unicode.h"
#include "vfs_cache.h"
-#define KSMBD_VERSION "3.4.2"
-
extern int ksmbd_debug_types;
#define KSMBD_DEBUG_SMB BIT(0)
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 0ebf91ffa236..8ca8a45c4c62 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -75,6 +75,7 @@ struct ksmbd_heartbeat {
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1)
#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2)
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3)
+#define KSMBD_GLOBAL_FLAG_DURABLE_HANDLE BIT(4)
/*
* IPC request for ksmbd server startup
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 15f68ee05089..aec0a7a12405 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -156,7 +156,7 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
kfree(sess);
}
-static struct ksmbd_session *__session_lookup(unsigned long long id)
+struct ksmbd_session *__session_lookup(unsigned long long id)
{
struct ksmbd_session *sess;
@@ -305,6 +305,32 @@ struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
return sess;
}
+void destroy_previous_session(struct ksmbd_conn *conn,
+ struct ksmbd_user *user, u64 id)
+{
+ struct ksmbd_session *prev_sess;
+ struct ksmbd_user *prev_user;
+
+ down_write(&sessions_table_lock);
+ down_write(&conn->session_lock);
+ prev_sess = __session_lookup(id);
+ if (!prev_sess || prev_sess->state == SMB2_SESSION_EXPIRED)
+ goto out;
+
+ prev_user = prev_sess->user;
+ if (!prev_user ||
+ strcmp(user->name, prev_user->name) ||
+ user->passkey_sz != prev_user->passkey_sz ||
+ memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
+ goto out;
+
+ ksmbd_destroy_file_table(&prev_sess->file_table);
+ prev_sess->state = SMB2_SESSION_EXPIRED;
+out:
+ up_write(&conn->session_lock);
+ up_write(&sessions_table_lock);
+}
+
static bool ksmbd_preauth_session_id_match(struct preauth_session *sess,
unsigned long long id)
{
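destroy_previous_session() establishes the lock order for tearing down a session that is being reconnected over: the global sessions_table_lock is taken before the per-connection session_lock, and the old session is only expired after the reconnecting user's name and passkey match the previous session's byte for byte. The ordering, reduced to a sketch (illustrative function; the real locks are the two rw_semaphores used above):

	#include <linux/rwsem.h>

	/* Take the global table lock first, the per-connection lock second,
	 * releasing in reverse order, as destroy_previous_session() does. */
	static void with_session_locks(struct rw_semaphore *table_lock,
				       struct rw_semaphore *conn_lock)
	{
		down_write(table_lock);
		down_write(conn_lock);
		/* ... look up and invalidate the previous session ... */
		up_write(conn_lock);
		up_write(table_lock);
	}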
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index 63cb08fffde8..dc9fded2cd43 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -88,8 +88,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
int ksmbd_session_register(struct ksmbd_conn *conn,
struct ksmbd_session *sess);
void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
+struct ksmbd_session *__session_lookup(unsigned long long id);
struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
unsigned long long id);
+void destroy_previous_session(struct ksmbd_conn *conn,
+ struct ksmbd_user *user, u64 id);
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
u64 sess_id);
struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn,
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 53dfaac425c6..4978edfb15f9 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -159,7 +159,8 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info,
op_entry);
if (opinfo) {
- if (!atomic_inc_not_zero(&opinfo->refcount))
+ if (opinfo->conn == NULL ||
+ !atomic_inc_not_zero(&opinfo->refcount))
opinfo = NULL;
else {
atomic_inc(&opinfo->conn->r_count);
@@ -527,7 +528,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
*/
read_lock(&ci->m_lock);
list_for_each_entry(opinfo, &ci->m_op_list, op_entry) {
- if (!opinfo->is_lease)
+ if (!opinfo->is_lease || !opinfo->conn)
continue;
read_unlock(&ci->m_lock);
lease = opinfo->o_lease;
@@ -641,7 +642,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
struct smb2_hdr *rsp_hdr;
struct ksmbd_file *fp;
- fp = ksmbd_lookup_durable_fd(br_info->fid);
+ fp = ksmbd_lookup_global_fd(br_info->fid);
if (!fp)
goto out;
@@ -1106,7 +1107,7 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
read_lock(&p_ci->m_lock);
list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
- if (!opinfo->is_lease)
+ if (opinfo->conn == NULL || !opinfo->is_lease)
continue;
if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE &&
@@ -1142,7 +1143,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
opinfo = rcu_dereference(fp->f_opinfo);
rcu_read_unlock();
- if (!opinfo->is_lease || opinfo->o_lease->version != 2)
+ if (!opinfo || !opinfo->is_lease || opinfo->o_lease->version != 2)
return;
p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
@@ -1151,7 +1152,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
read_lock(&p_ci->m_lock);
list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
- if (!opinfo->is_lease)
+ if (opinfo->conn == NULL || !opinfo->is_lease)
continue;
if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE) {
@@ -1361,6 +1362,9 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
rcu_read_lock();
list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) {
+ if (brk_op->conn == NULL)
+ continue;
+
if (!atomic_inc_not_zero(&brk_op->refcount))
continue;
@@ -1496,11 +1500,10 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
/**
* parse_lease_state() - parse lease context contained in file open request
* @open_req: buffer containing smb2 file open(create) request
- * @is_dir: whether leasing file is directory
*
* Return: oplock state, -ENOENT if create lease context not found
*/
-struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
+struct lease_ctx_info *parse_lease_state(void *open_req)
{
struct create_context *cc;
struct smb2_create_req *req = (struct smb2_create_req *)open_req;
@@ -1518,12 +1521,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
- if (is_dir) {
- lreq->req_state = lc->lcontext.LeaseState &
- ~SMB2_LEASE_WRITE_CACHING_LE;
- lreq->is_dir = true;
- } else
- lreq->req_state = lc->lcontext.LeaseState;
+ lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
lreq->epoch = lc->lcontext.Epoch;
lreq->duration = lc->lcontext.LeaseDuration;
@@ -1646,6 +1644,8 @@ void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp)
buf->Name[3] = 'Q';
buf->Timeout = cpu_to_le32(fp->durable_timeout);
+ if (fp->is_persistent)
+ buf->Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
}
/**
@@ -1813,3 +1813,71 @@ out:
read_unlock(&lease_list_lock);
return ret_op;
}
+
+int smb2_check_durable_oplock(struct ksmbd_conn *conn,
+ struct ksmbd_share_config *share,
+ struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx,
+ char *name)
+{
+ struct oplock_info *opinfo = opinfo_get(fp);
+ int ret = 0;
+
+ if (!opinfo)
+ return 0;
+
+ if (opinfo->is_lease == false) {
+ if (lctx) {
+ pr_err("create context include lease\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (opinfo->level != SMB2_OPLOCK_LEVEL_BATCH) {
+ pr_err("oplock level is not equal to SMB2_OPLOCK_LEVEL_BATCH\n");
+ ret = -EBADF;
+ }
+
+ goto out;
+ }
+
+ if (memcmp(conn->ClientGUID, fp->client_guid,
+ SMB2_CLIENT_GUID_SIZE)) {
+ ksmbd_debug(SMB, "Client guid of fp is not equal to the one of connection\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (!lctx) {
+ ksmbd_debug(SMB, "create context does not include lease\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ ksmbd_debug(SMB,
+ "lease key of fp does not match lease key in create context\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (!(opinfo->o_lease->state & SMB2_LEASE_HANDLE_CACHING_LE)) {
+ ksmbd_debug(SMB, "lease state does not contain SMB2_LEASE_HANDLE_CACHING\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (opinfo->o_lease->version != lctx->version) {
+ ksmbd_debug(SMB,
+ "lease version of fp does not match the one in create context\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (!ksmbd_inode_pending_delete(fp))
+ ret = ksmbd_validate_name_reconnect(share, fp, name);
+out:
+ opinfo_put(opinfo);
+ return ret;
+}
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 5b93ea9196c0..e9da63f25b20 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -111,7 +111,7 @@ void opinfo_put(struct oplock_info *opinfo);
/* Lease related functions */
void create_lease_buf(u8 *rbuf, struct lease *lease);
-struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir);
+struct lease_ctx_info *parse_lease_state(void *open_req);
__u8 smb2_map_lease_to_oplock(__le32 lease_state);
int lease_read_to_write(struct oplock_info *opinfo);
@@ -130,4 +130,9 @@ void destroy_lease_table(struct ksmbd_conn *conn);
void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
struct lease_ctx_info *lctx);
void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp);
+int smb2_check_durable_oplock(struct ksmbd_conn *conn,
+ struct ksmbd_share_config *share,
+ struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx,
+ char *name);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 3079e607c5fe..c0788188aa82 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -625,7 +625,6 @@ static void __exit ksmbd_server_exit(void)
}
MODULE_AUTHOR("Namjae Jeon <linkinjeon@kernel.org>");
-MODULE_VERSION(KSMBD_VERSION);
MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: ecb");
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 03dded29a980..727cb49926ee 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -101,13 +101,17 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*len = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferLength);
break;
case SMB2_TREE_CONNECT:
- *off = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset);
+ *off = max_t(unsigned short int,
+ le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset),
+ offsetof(struct smb2_tree_connect_req, Buffer));
*len = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathLength);
break;
case SMB2_CREATE:
{
unsigned short int name_off =
- le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
+ max_t(unsigned short int,
+ le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset),
+ offsetof(struct smb2_create_req, Buffer));
unsigned short int name_len =
le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
@@ -128,11 +132,15 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
}
case SMB2_QUERY_INFO:
- *off = le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset);
+ *off = max_t(unsigned int,
+ le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset),
+ offsetof(struct smb2_query_info_req, Buffer));
*len = le32_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferLength);
break;
case SMB2_SET_INFO:
- *off = le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset);
+ *off = max_t(unsigned int,
+ le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset),
+ offsetof(struct smb2_set_info_req, Buffer));
*len = le32_to_cpu(((struct smb2_set_info_req *)hdr)->BufferLength);
break;
case SMB2_READ:
@@ -142,7 +150,7 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
case SMB2_WRITE:
if (((struct smb2_write_req *)hdr)->DataOffset ||
((struct smb2_write_req *)hdr)->Length) {
- *off = max_t(unsigned int,
+ *off = max_t(unsigned short int,
le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
offsetof(struct smb2_write_req, Buffer));
*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
@@ -153,7 +161,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*len = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoLength);
break;
case SMB2_QUERY_DIRECTORY:
- *off = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset);
+ *off = max_t(unsigned short int,
+ le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset),
+ offsetof(struct smb2_query_directory_req, Buffer));
*len = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameLength);
break;
case SMB2_LOCK:
@@ -168,7 +178,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
}
case SMB2_IOCTL:
- *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset);
+ *off = max_t(unsigned int,
+ le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset),
+ offsetof(struct smb2_ioctl_req, Buffer));
*len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount);
break;
default:
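
Every hunk above applies the same hardening pattern: a client-controlled offset is clamped with max_t()/offsetof() so it can never point back into the fixed-size request header. A minimal sketch with a hypothetical request layout:

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/stddef.h>

struct example_req {			/* illustration only */
	__le16 StructureSize;
	__le16 DataOffset;		/* client-controlled */
	__le32 DataLength;		/* client-controlled */
	__u8   Buffer[];
};

static unsigned int example_data_off(const struct example_req *req)
{
	/* never let the wire offset land before the end of the header */
	return max_t(unsigned int, le16_to_cpu(req->DataOffset),
		     offsetof(struct example_req, Buffer));
}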
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index 27a9dce3e03a..a45f7dca482e 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -256,6 +256,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_PERSISTENT_HANDLES;
}
/**
@@ -283,6 +286,9 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_PERSISTENT_HANDLES;
+
INIT_LIST_HEAD(&conn->preauth_sess_table);
return 0;
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 089527a8b4ff..d478fa0c57ab 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -607,30 +607,6 @@ int smb2_check_user_session(struct ksmbd_work *work)
return -ENOENT;
}
-static void destroy_previous_session(struct ksmbd_conn *conn,
- struct ksmbd_user *user, u64 id)
-{
- struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
- struct ksmbd_user *prev_user;
- struct channel *chann;
- long index;
-
- if (!prev_sess)
- return;
-
- prev_user = prev_sess->user;
-
- if (!prev_user ||
- strcmp(user->name, prev_user->name) ||
- user->passkey_sz != prev_user->passkey_sz ||
- memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
- return;
-
- prev_sess->state = SMB2_SESSION_EXPIRED;
- xa_for_each(&prev_sess->ksmbd_chann_list, index, chann)
- ksmbd_conn_set_exiting(chann->conn);
-}
-
/**
* smb2_get_name() - get filename string from the on-the-wire SMB format
* @src: source buffer
@@ -1951,7 +1927,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
WORK_BUFFERS(work, req, rsp);
- treename = smb_strndup_from_utf16(req->Buffer,
+ treename = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->PathOffset),
le16_to_cpu(req->PathLength), true,
conn->local_nls);
if (IS_ERR(treename)) {
@@ -2642,6 +2618,165 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
}
}
+enum {
+ DURABLE_RECONN_V2 = 1,
+ DURABLE_RECONN,
+ DURABLE_REQ_V2,
+ DURABLE_REQ,
+};
+
+struct durable_info {
+ struct ksmbd_file *fp;
+ unsigned short int type;
+ bool persistent;
+ bool reconnected;
+ unsigned int timeout;
+ char *CreateGuid;
+};
+
+static int parse_durable_handle_context(struct ksmbd_work *work,
+ struct smb2_create_req *req,
+ struct lease_ctx_info *lc,
+ struct durable_info *dh_info)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct create_context *context;
+ int dh_idx, err = 0;
+ u64 persistent_id = 0;
+ int req_op_level;
+ static const char * const durable_arr[] = {"DH2C", "DHnC", "DH2Q", "DHnQ"};
+
+ req_op_level = req->RequestedOplockLevel;
+ for (dh_idx = DURABLE_RECONN_V2; dh_idx <= ARRAY_SIZE(durable_arr);
+ dh_idx++) {
+ context = smb2_find_context_vals(req, durable_arr[dh_idx - 1], 4);
+ if (IS_ERR(context)) {
+ err = PTR_ERR(context);
+ goto out;
+ }
+ if (!context)
+ continue;
+
+ switch (dh_idx) {
+ case DURABLE_RECONN_V2:
+ {
+ struct create_durable_reconn_v2_req *recon_v2;
+
+ if (dh_info->type == DURABLE_RECONN ||
+ dh_info->type == DURABLE_REQ_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ recon_v2 = (struct create_durable_reconn_v2_req *)context;
+ persistent_id = recon_v2->Fid.PersistentFileId;
+ dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
+ if (!dh_info->fp) {
+ ksmbd_debug(SMB, "Failed to get durable handle state\n");
+ err = -EBADF;
+ goto out;
+ }
+
+ if (memcmp(dh_info->fp->create_guid, recon_v2->CreateGuid,
+ SMB2_CREATE_GUID_SIZE)) {
+ err = -EBADF;
+ ksmbd_put_durable_fd(dh_info->fp);
+ goto out;
+ }
+
+ dh_info->type = dh_idx;
+ dh_info->reconnected = true;
+ ksmbd_debug(SMB,
+ "reconnect v2 Persistent-id from reconnect = %llu\n",
+ persistent_id);
+ break;
+ }
+ case DURABLE_RECONN:
+ {
+ struct create_durable_reconn_req *recon;
+
+ if (dh_info->type == DURABLE_RECONN_V2 ||
+ dh_info->type == DURABLE_REQ_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ recon = (struct create_durable_reconn_req *)context;
+ persistent_id = recon->Data.Fid.PersistentFileId;
+ dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
+ if (!dh_info->fp) {
+ ksmbd_debug(SMB, "Failed to get durable handle state\n");
+ err = -EBADF;
+ goto out;
+ }
+
+ dh_info->type = dh_idx;
+ dh_info->reconnected = true;
+ ksmbd_debug(SMB, "reconnect Persistent-id from reconnect = %llu\n",
+ persistent_id);
+ break;
+ }
+ case DURABLE_REQ_V2:
+ {
+ struct create_durable_req_v2 *durable_v2_blob;
+
+ if (dh_info->type == DURABLE_RECONN ||
+ dh_info->type == DURABLE_RECONN_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ durable_v2_blob =
+ (struct create_durable_req_v2 *)context;
+ ksmbd_debug(SMB, "Request for durable v2 open\n");
+ dh_info->fp = ksmbd_lookup_fd_cguid(durable_v2_blob->CreateGuid);
+ if (dh_info->fp) {
+ if (!memcmp(conn->ClientGUID, dh_info->fp->client_guid,
+ SMB2_CLIENT_GUID_SIZE)) {
+ if (!(req->hdr.Flags & SMB2_FLAGS_REPLAY_OPERATION)) {
+ err = -ENOEXEC;
+ goto out;
+ }
+
+ dh_info->fp->conn = conn;
+ dh_info->reconnected = true;
+ goto out;
+ }
+ }
+
+ if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+ req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+ dh_info->CreateGuid =
+ durable_v2_blob->CreateGuid;
+ dh_info->persistent =
+ le32_to_cpu(durable_v2_blob->Flags);
+ dh_info->timeout =
+ le32_to_cpu(durable_v2_blob->Timeout);
+ dh_info->type = dh_idx;
+ }
+ break;
+ }
+ case DURABLE_REQ:
+ if (dh_info->type == DURABLE_RECONN)
+ goto out;
+ if (dh_info->type == DURABLE_RECONN_V2 ||
+ dh_info->type == DURABLE_REQ_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+ req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+ ksmbd_debug(SMB, "Request for durable open\n");
+ dh_info->type = dh_idx;
+ }
+ }
+ }
+
+out:
+ return err;
+}
+
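For reference, the four create-context tags scanned above map to the handler's internal types as follows (tags per MS-SMB2 2.2.13.2); combining a reconnect context with a request context fails the open with -EINVAL:

/*
 *   "DH2C"  ->  DURABLE_RECONN_V2   durable handle reconnect v2
 *   "DHnC"  ->  DURABLE_RECONN      durable handle reconnect
 *   "DH2Q"  ->  DURABLE_REQ_V2      durable handle request v2
 *   "DHnQ"  ->  DURABLE_REQ         durable handle request
 */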
/**
* smb2_open() - handler for smb file open request
* @work: smb work containing request buffer
@@ -2665,6 +2800,7 @@ int smb2_open(struct ksmbd_work *work)
struct lease_ctx_info *lc = NULL;
struct create_ea_buf_req *ea_buf = NULL;
struct oplock_info *opinfo;
+ struct durable_info dh_info = {0};
__le32 *next_ptr = NULL;
int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
int rc = 0;
@@ -2704,7 +2840,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- name = smb2_get_name(req->Buffer,
+ name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
work->conn->local_nls);
if (IS_ERR(name)) {
@@ -2745,6 +2881,49 @@ int smb2_open(struct ksmbd_work *work)
}
}
+ req_op_level = req->RequestedOplockLevel;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE &&
+ req->CreateContextsOffset) {
+ lc = parse_lease_state(req);
+ rc = parse_durable_handle_context(work, req, lc, &dh_info);
+ if (rc) {
+ ksmbd_debug(SMB, "error parsing durable handle context\n");
+ goto err_out2;
+ }
+
+ if (dh_info.reconnected == true) {
+ rc = smb2_check_durable_oplock(conn, share, dh_info.fp, lc, name);
+ if (rc) {
+ ksmbd_put_durable_fd(dh_info.fp);
+ goto err_out2;
+ }
+
+ rc = ksmbd_reopen_durable_fd(work, dh_info.fp);
+ if (rc) {
+ ksmbd_put_durable_fd(dh_info.fp);
+ goto err_out2;
+ }
+
+ if (ksmbd_override_fsids(work)) {
+ rc = -ENOMEM;
+ ksmbd_put_durable_fd(dh_info.fp);
+ goto err_out2;
+ }
+
+ fp = dh_info.fp;
+ file_info = FILE_OPENED;
+
+ rc = ksmbd_vfs_getattr(&fp->filp->f_path, &stat);
+ if (rc)
+ goto err_out2;
+
+ ksmbd_put_durable_fd(fp);
+ goto reconnected_fp;
+ }
+ } else if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
+ lc = parse_lease_state(req);
+
if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) {
pr_err("Invalid impersonationlevel : 0x%x\n",
le32_to_cpu(req->ImpersonationLevel));
@@ -3207,10 +3386,6 @@ int smb2_open(struct ksmbd_work *work)
need_truncate = 1;
}
- req_op_level = req->RequestedOplockLevel;
- if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
- lc = parse_lease_state(req, S_ISDIR(file_inode(filp)->i_mode));
-
share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp);
if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) ||
(req_op_level == SMB2_OPLOCK_LEVEL_LEASE &&
@@ -3221,6 +3396,11 @@ int smb2_open(struct ksmbd_work *work)
}
} else {
if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ if (S_ISDIR(file_inode(filp)->i_mode)) {
+ lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE;
+ lc->is_dir = true;
+ }
+
/*
* Compare parent leases using the parent key. If no lease
* has the same parent key, send a lease break
@@ -3317,6 +3497,24 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
+ if (dh_info.type == DURABLE_REQ_V2 || dh_info.type == DURABLE_REQ) {
+ if (dh_info.type == DURABLE_REQ_V2 && dh_info.persistent)
+ fp->is_persistent = true;
+ else
+ fp->is_durable = true;
+
+ if (dh_info.type == DURABLE_REQ_V2) {
+ memcpy(fp->create_guid, dh_info.CreateGuid,
+ SMB2_CREATE_GUID_SIZE);
+ if (dh_info.timeout)
+ fp->durable_timeout = min(dh_info.timeout,
+ 300000);
+ else
+ fp->durable_timeout = 60;
+ }
+ }
+
+reconnected_fp:
rsp->StructureSize = cpu_to_le16(89);
rcu_read_lock();
opinfo = rcu_dereference(fp->f_opinfo);
@@ -3403,6 +3601,33 @@ int smb2_open(struct ksmbd_work *work)
next_off = conn->vals->create_disk_id_size;
}
+ if (dh_info.type == DURABLE_REQ || dh_info.type == DURABLE_REQ_V2) {
+ struct create_context *durable_ccontext;
+
+ durable_ccontext = (struct create_context *)(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength));
+ contxt_cnt++;
+ if (dh_info.type == DURABLE_REQ) {
+ create_durable_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength));
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_durable_size);
+ iov_len += conn->vals->create_durable_size;
+ } else {
+ create_durable_v2_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength),
+ fp);
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_durable_v2_size);
+ iov_len += conn->vals->create_durable_v2_size;
+ }
+
+ if (next_ptr)
+ *next_ptr = cpu_to_le32(next_off);
+ next_ptr = &durable_ccontext->Next;
+ next_off = conn->vals->create_durable_size;
+ }
+
if (posix_ctxt) {
contxt_cnt++;
create_posix_rsp_buf(rsp->Buffer +
@@ -3828,11 +4053,16 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
}
ksmbd_kstat.kstat = &kstat;
- if (priv->info_level != FILE_NAMES_INFORMATION)
- ksmbd_vfs_fill_dentry_attrs(priv->work,
- idmap,
- dent,
- &ksmbd_kstat);
+ if (priv->info_level != FILE_NAMES_INFORMATION) {
+ rc = ksmbd_vfs_fill_dentry_attrs(priv->work,
+ idmap,
+ dent,
+ &ksmbd_kstat);
+ if (rc) {
+ dput(dent);
+ continue;
+ }
+ }
rc = smb2_populate_readdir_entry(priv->work->conn,
priv->info_level,
@@ -4075,7 +4305,7 @@ int smb2_query_dir(struct ksmbd_work *work)
}
srch_flag = req->Flags;
- srch_ptr = smb_strndup_from_utf16(req->Buffer,
+ srch_ptr = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->FileNameOffset),
le16_to_cpu(req->FileNameLength), 1,
conn->local_nls);
if (IS_ERR(srch_ptr)) {
@@ -4335,7 +4565,8 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
sizeof(struct smb2_ea_info_req))
return -EINVAL;
- ea_req = (struct smb2_ea_info_req *)req->Buffer;
+ ea_req = (struct smb2_ea_info_req *)((char *)req +
+ le16_to_cpu(req->InputBufferOffset));
} else {
/* need to send all EAs, if no specific EA is requested */
if (le32_to_cpu(req->Flags) & SL_RETURN_SINGLE_ENTRY)
@@ -4480,6 +4711,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_basic_info *basic_info;
struct kstat stat;
u64 time;
+ int ret;
if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
pr_err("no right to read the attributes : 0x%x\n",
@@ -4487,9 +4719,12 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
return -EACCES;
}
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
+
basic_info = (struct smb2_file_basic_info *)rsp->Buffer;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
basic_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
basic_info->LastAccessTime = cpu_to_le64(time);
@@ -4504,27 +4739,31 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
return 0;
}
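
This and the following hunks all make the same substitution: attributes are fetched through ->getattr() with vfs_getattr() instead of being read raw out of the in-core inode with generic_fillattr(), so stacked and network-backed filesystems report coherent sizes and timestamps, and the error is propagated instead of silently ignored. The shared shape, as a sketch:

#include <linux/fs.h>
#include <linux/stat.h>

static int example_stat_fp(struct file *filp, struct kstat *stat)
{
	/* ask the filesystem, not the cached inode fields */
	return vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS,
			   AT_STATX_SYNC_AS_STAT);
}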
-static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp, void *rsp_org)
+static int get_file_standard_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_standard_info *sinfo;
unsigned int delete_pending;
- struct inode *inode;
struct kstat stat;
+ int ret;
- inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
delete_pending = ksmbd_inode_pending_delete(fp);
- sinfo->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
+ sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending);
sinfo->DeletePending = delete_pending;
sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_standard_info));
+
+ return 0;
}
static void get_file_alignment_info(struct smb2_query_info_rsp *rsp,
@@ -4546,11 +4785,11 @@ static int get_file_all_info(struct ksmbd_work *work,
struct ksmbd_conn *conn = work->conn;
struct smb2_file_all_info *file_info;
unsigned int delete_pending;
- struct inode *inode;
struct kstat stat;
int conv_len;
char *filename;
u64 time;
+ int ret;
if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
ksmbd_debug(SMB, "no right to read the attributes : 0x%x\n",
@@ -4562,8 +4801,10 @@ static int get_file_all_info(struct ksmbd_work *work,
if (IS_ERR(filename))
return PTR_ERR(filename);
- inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
ksmbd_debug(SMB, "filename = %s\n", filename);
delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4579,7 +4820,7 @@ static int get_file_all_info(struct ksmbd_work *work,
file_info->Attributes = fp->f_ci->m_fattr;
file_info->Pad1 = 0;
file_info->AllocationSize =
- cpu_to_le64(inode->i_blocks << 9);
+ cpu_to_le64(stat.blocks << 9);
file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
file_info->NumberOfLinks =
cpu_to_le32(get_nlink(&stat) - delete_pending);
@@ -4623,10 +4864,10 @@ static void get_file_alternate_info(struct ksmbd_work *work,
cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len);
}
-static void get_file_stream_info(struct ksmbd_work *work,
- struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp,
- void *rsp_org)
+static int get_file_stream_info(struct ksmbd_work *work,
+ struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp,
+ void *rsp_org)
{
struct ksmbd_conn *conn = work->conn;
struct smb2_file_stream_info *file_info;
@@ -4637,9 +4878,13 @@ static void get_file_stream_info(struct ksmbd_work *work,
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
int buf_free_len;
struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
+ int ret;
+
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
file_info = (struct smb2_file_stream_info *)rsp->Buffer;
buf_free_len =
@@ -4720,29 +4965,37 @@ out:
kvfree(xattr_list);
rsp->OutputBufferLength = cpu_to_le32(nbytes);
+
+ return 0;
}
-static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp, void *rsp_org)
+static int get_file_internal_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_internal_info *file_info;
struct kstat stat;
+ int ret;
+
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
file_info = (struct smb2_file_internal_info *)rsp->Buffer;
file_info->IndexNumber = cpu_to_le64(stat.ino);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_internal_info));
+
+ return 0;
}
static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_ntwrk_info *file_info;
- struct inode *inode;
struct kstat stat;
u64 time;
+ int ret;
if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
pr_err("no right to read the attributes : 0x%x\n",
@@ -4750,10 +5003,12 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
return -EACCES;
}
- file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
- inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+ file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
@@ -4763,8 +5018,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
time = ksmbd_UnixTimeToNT(stat.ctime);
file_info->ChangeTime = cpu_to_le64(time);
file_info->Attributes = fp->f_ci->m_fattr;
- file_info->AllocationSize =
- cpu_to_le64(inode->i_blocks << 9);
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
file_info->Reserved = cpu_to_le32(0);
rsp->OutputBufferLength =
@@ -4804,14 +5058,17 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp,
cpu_to_le32(sizeof(struct smb2_file_mode_info));
}
-static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp, void *rsp_org)
+static int get_file_compression_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_comp_info *file_info;
struct kstat stat;
+ int ret;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
file_info = (struct smb2_file_comp_info *)rsp->Buffer;
file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9);
@@ -4823,6 +5080,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_comp_info));
+
+ return 0;
}
static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
@@ -4844,7 +5103,7 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
return 0;
}
-static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
+static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
struct ksmbd_file *fp, void *rsp_org)
{
struct smb311_posix_qinfo *file_info;
@@ -4852,24 +5111,31 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
struct mnt_idmap *idmap = file_mnt_idmap(fp->filp);
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
+ struct kstat stat;
u64 time;
int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32;
+ int ret;
+
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
+ time = ksmbd_UnixTimeToNT(stat.atime);
file_info->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
+ time = ksmbd_UnixTimeToNT(stat.mtime);
file_info->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
+ time = ksmbd_UnixTimeToNT(stat.ctime);
file_info->ChangeTime = cpu_to_le64(time);
file_info->DosAttributes = fp->f_ci->m_fattr;
- file_info->Inode = cpu_to_le64(inode->i_ino);
- file_info->EndOfFile = cpu_to_le64(inode->i_size);
- file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
- file_info->HardLinks = cpu_to_le32(inode->i_nlink);
- file_info->Mode = cpu_to_le32(inode->i_mode & 0777);
- file_info->DeviceId = cpu_to_le32(inode->i_rdev);
+ file_info->Inode = cpu_to_le64(stat.ino);
+ file_info->EndOfFile = cpu_to_le64(stat.size);
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ file_info->HardLinks = cpu_to_le32(stat.nlink);
+ file_info->Mode = cpu_to_le32(stat.mode & 0777);
+ file_info->DeviceId = cpu_to_le32(stat.rdev);
/*
* Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)).
@@ -4882,6 +5148,8 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
+
+ return 0;
}
static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4930,7 +5198,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
break;
case FILE_STANDARD_INFORMATION:
- get_file_standard_info(rsp, fp, work->response_buf);
+ rc = get_file_standard_info(rsp, fp, work->response_buf);
break;
case FILE_ALIGNMENT_INFORMATION:
@@ -4946,11 +5214,11 @@ static int smb2_get_info_file(struct ksmbd_work *work,
break;
case FILE_STREAM_INFORMATION:
- get_file_stream_info(work, rsp, fp, work->response_buf);
+ rc = get_file_stream_info(work, rsp, fp, work->response_buf);
break;
case FILE_INTERNAL_INFORMATION:
- get_file_internal_info(rsp, fp, work->response_buf);
+ rc = get_file_internal_info(rsp, fp, work->response_buf);
break;
case FILE_NETWORK_OPEN_INFORMATION:
@@ -4974,7 +5242,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
break;
case FILE_COMPRESSION_INFORMATION:
- get_file_compression_info(rsp, fp, work->response_buf);
+ rc = get_file_compression_info(rsp, fp, work->response_buf);
break;
case FILE_ATTRIBUTE_TAG_INFORMATION:
@@ -4985,7 +5253,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- find_file_posix_info(rsp, fp, work->response_buf);
+ rc = find_file_posix_info(rsp, fp, work->response_buf);
}
break;
default:
@@ -5398,7 +5666,6 @@ int smb2_close(struct ksmbd_work *work)
struct smb2_close_rsp *rsp;
struct ksmbd_conn *conn = work->conn;
struct ksmbd_file *fp;
- struct inode *inode;
u64 time;
int err = 0;
@@ -5453,24 +5720,33 @@ int smb2_close(struct ksmbd_work *work)
rsp->Reserved = 0;
if (req->Flags == SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB) {
+ struct kstat stat;
+ int ret;
+
fp = ksmbd_lookup_fd_fast(work, volatile_id);
if (!fp) {
err = -ENOENT;
goto out;
}
- inode = file_inode(fp->filp);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret) {
+ ksmbd_fd_put(work, fp);
+ goto out;
+ }
+
rsp->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
- rsp->AllocationSize = S_ISDIR(inode->i_mode) ? 0 :
- cpu_to_le64(inode->i_blocks << 9);
- rsp->EndOfFile = cpu_to_le64(inode->i_size);
+ rsp->AllocationSize = S_ISDIR(stat.mode) ? 0 :
+ cpu_to_le64(stat.blocks << 9);
+ rsp->EndOfFile = cpu_to_le64(stat.size);
rsp->Attributes = fp->f_ci->m_fattr;
rsp->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
+ time = ksmbd_UnixTimeToNT(stat.atime);
rsp->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
+ time = ksmbd_UnixTimeToNT(stat.mtime);
rsp->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
+ time = ksmbd_UnixTimeToNT(stat.ctime);
rsp->ChangeTime = cpu_to_le64(time);
ksmbd_fd_put(work, fp);
} else {
@@ -5759,15 +6035,21 @@ static int set_file_allocation_info(struct ksmbd_work *work,
loff_t alloc_blks;
struct inode *inode;
+ struct kstat stat;
int rc;
if (!(fp->daccess & FILE_WRITE_DATA_LE))
return -EACCES;
+ rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (rc)
+ return rc;
+
alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9;
inode = file_inode(fp->filp);
- if (alloc_blks > inode->i_blocks) {
+ if (alloc_blks > stat.blocks) {
smb_break_all_levII_oplock(work, fp, 1);
rc = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0,
alloc_blks * 512);
@@ -5775,7 +6057,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
pr_err("vfs_fallocate is failed : %d\n", rc);
return rc;
}
- } else if (alloc_blks < inode->i_blocks) {
+ } else if (alloc_blks < stat.blocks) {
loff_t size;
/*
@@ -5930,6 +6212,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
struct ksmbd_share_config *share)
{
unsigned int buf_len = le32_to_cpu(req->BufferLength);
+ char *buffer = (char *)req + le16_to_cpu(req->BufferOffset);
switch (req->FileInfoClass) {
case FILE_BASIC_INFORMATION:
@@ -5937,7 +6220,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
if (buf_len < sizeof(struct smb2_file_basic_info))
return -EINVAL;
- return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share);
+ return set_file_basic_info(fp, (struct smb2_file_basic_info *)buffer, share);
}
case FILE_ALLOCATION_INFORMATION:
{
@@ -5945,7 +6228,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_file_allocation_info(work, fp,
- (struct smb2_file_alloc_info *)req->Buffer);
+ (struct smb2_file_alloc_info *)buffer);
}
case FILE_END_OF_FILE_INFORMATION:
{
@@ -5953,7 +6236,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_end_of_file_info(work, fp,
- (struct smb2_file_eof_info *)req->Buffer);
+ (struct smb2_file_eof_info *)buffer);
}
case FILE_RENAME_INFORMATION:
{
@@ -5961,7 +6244,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_rename_info(work, fp,
- (struct smb2_file_rename_info *)req->Buffer,
+ (struct smb2_file_rename_info *)buffer,
buf_len);
}
case FILE_LINK_INFORMATION:
@@ -5970,7 +6253,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return smb2_create_link(work, work->tcon->share_conf,
- (struct smb2_file_link_info *)req->Buffer,
+ (struct smb2_file_link_info *)buffer,
buf_len, fp->filp,
work->conn->local_nls);
}
@@ -5980,7 +6263,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_file_disposition_info(fp,
- (struct smb2_file_disposition_info *)req->Buffer);
+ (struct smb2_file_disposition_info *)buffer);
}
case FILE_FULL_EA_INFORMATION:
{
@@ -5993,7 +6276,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
if (buf_len < sizeof(struct smb2_ea_info))
return -EINVAL;
- return smb2_set_ea((struct smb2_ea_info *)req->Buffer,
+ return smb2_set_ea((struct smb2_ea_info *)buffer,
buf_len, &fp->filp->f_path, true);
}
case FILE_POSITION_INFORMATION:
@@ -6001,14 +6284,14 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
if (buf_len < sizeof(struct smb2_file_pos_info))
return -EINVAL;
- return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer);
+ return set_file_position_info(fp, (struct smb2_file_pos_info *)buffer);
}
case FILE_MODE_INFORMATION:
{
if (buf_len < sizeof(struct smb2_file_mode_info))
return -EINVAL;
- return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer);
+ return set_file_mode_info(fp, (struct smb2_file_mode_info *)buffer);
}
}
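
The buffer pointer computed at the top of smb2_set_info_file() pairs with the offset clamping in smb2misc.c: the payload is resolved from the validated BufferOffset rather than assumed to start at the fixed req->Buffer member, so the bytes the handlers parse are exactly the bytes that were bounds-checked. As a sketch:

/* resolve a payload from the already-validated wire offset */
static inline void *example_req_payload(void *req, unsigned short off)
{
	return (char *)req + off;
}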
@@ -6089,7 +6372,7 @@ int smb2_set_info(struct ksmbd_work *work)
}
rc = smb2_set_info_sec(fp,
le32_to_cpu(req->AdditionalInformation),
- req->Buffer,
+ (char *)req + le16_to_cpu(req->BufferOffset),
le32_to_cpu(req->BufferLength));
ksmbd_revert_fsids(work);
break;
@@ -7535,7 +7818,7 @@ static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
struct smb2_ioctl_rsp *rsp)
{
struct ksmbd_rpc_command *rpc_resp;
- char *data_buf = (char *)&req->Buffer[0];
+ char *data_buf = (char *)req + le32_to_cpu(req->InputOffset);
int nbytes = 0;
rpc_resp = ksmbd_rpc_ioctl(work->sess, id, data_buf,
@@ -7648,6 +7931,7 @@ int smb2_ioctl(struct ksmbd_work *work)
u64 id = KSMBD_NO_FID;
struct ksmbd_conn *conn = work->conn;
int ret = 0;
+ char *buffer;
if (work->next_smb2_rcv_hdr_off) {
req = ksmbd_req_buf_next(work);
@@ -7670,6 +7954,8 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ buffer = (char *)req + le32_to_cpu(req->InputOffset);
+
cnt_code = le32_to_cpu(req->CtlCode);
ret = smb2_calc_max_out_buf_len(work, 48,
le32_to_cpu(req->MaxOutputResponse));
@@ -7727,7 +8013,7 @@ int smb2_ioctl(struct ksmbd_work *work)
}
ret = fsctl_validate_negotiate_info(conn,
- (struct validate_negotiate_info_req *)&req->Buffer[0],
+ (struct validate_negotiate_info_req *)buffer,
(struct validate_negotiate_info_rsp *)&rsp->Buffer[0],
in_buf_len);
if (ret < 0)
@@ -7780,7 +8066,7 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->VolatileFileId = req->VolatileFileId;
rsp->PersistentFileId = req->PersistentFileId;
fsctl_copychunk(work,
- (struct copychunk_ioctl_req *)&req->Buffer[0],
+ (struct copychunk_ioctl_req *)buffer,
le32_to_cpu(req->CtlCode),
le32_to_cpu(req->InputCount),
req->VolatileFileId,
@@ -7793,8 +8079,7 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- ret = fsctl_set_sparse(work, id,
- (struct file_sparse *)&req->Buffer[0]);
+ ret = fsctl_set_sparse(work, id, (struct file_sparse *)buffer);
if (ret < 0)
goto out;
break;
@@ -7817,7 +8102,7 @@ int smb2_ioctl(struct ksmbd_work *work)
}
zero_data =
- (struct file_zero_data_information *)&req->Buffer[0];
+ (struct file_zero_data_information *)buffer;
off = le64_to_cpu(zero_data->FileOffset);
bfz = le64_to_cpu(zero_data->BeyondFinalZero);
@@ -7848,7 +8133,7 @@ int smb2_ioctl(struct ksmbd_work *work)
}
ret = fsctl_query_allocated_ranges(work, id,
- (struct file_allocated_range_buffer *)&req->Buffer[0],
+ (struct file_allocated_range_buffer *)buffer,
(struct file_allocated_range_buffer *)&rsp->Buffer[0],
out_buf_len /
sizeof(struct file_allocated_range_buffer), &nbytes);
@@ -7892,7 +8177,7 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
+ dup_ext = (struct duplicate_extents_to_file *)buffer;
fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
dup_ext->PersistentFileHandle);
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index d12cfd3b0927..bd1d2a0e9203 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -72,6 +72,18 @@ struct create_durable_req_v2 {
__u8 CreateGuid[16];
} __packed;
+struct create_durable_reconn_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
struct create_durable_reconn_v2_req {
struct create_context ccontext;
__u8 Name[8];
@@ -98,6 +110,9 @@ struct create_durable_rsp {
} Data;
} __packed;
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
struct create_durable_v2_rsp {
struct create_context ccontext;
__u8 Name[8];
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 7c98bf699772..fcaf373cc008 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -457,10 +457,13 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
}
ksmbd_kstat.kstat = &kstat;
- ksmbd_vfs_fill_dentry_attrs(work,
- idmap,
- dentry,
- &ksmbd_kstat);
+ rc = ksmbd_vfs_fill_dentry_attrs(work,
+ idmap,
+ dentry,
+ &ksmbd_kstat);
+ if (rc)
+ break;
+
rc = fn(conn, info_level, d_info, &ksmbd_kstat);
if (rc)
break;
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index c487e834331a..22f0f3db3ac9 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1682,11 +1682,19 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
struct dentry *dentry,
struct ksmbd_kstat *ksmbd_kstat)
{
+ struct ksmbd_share_config *share_conf = work->tcon->share_conf;
u64 time;
int rc;
+ struct path path = {
+ .mnt = share_conf->vfs_path.mnt,
+ .dentry = dentry,
+ };
- generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry),
- ksmbd_kstat->kstat);
+ rc = vfs_getattr(&path, ksmbd_kstat->kstat,
+ STATX_BASIC_STATS | STATX_BTIME,
+ AT_STATX_SYNC_AS_STAT);
+ if (rc)
+ return rc;
time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
ksmbd_kstat->create_time = time;
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 4e82ff627d12..030f70700036 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -305,7 +305,8 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
fd_limit_close();
__ksmbd_remove_durable_fd(fp);
- __ksmbd_remove_fd(ft, fp);
+ if (ft)
+ __ksmbd_remove_fd(ft, fp);
close_id_del_oplock(fp);
filp = fp->filp;
@@ -465,11 +466,32 @@ struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
return fp;
}
-struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+struct ksmbd_file *ksmbd_lookup_global_fd(unsigned long long id)
{
return __ksmbd_lookup_fd(&global_ft, id);
}
+struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+{
+ struct ksmbd_file *fp;
+
+ fp = __ksmbd_lookup_fd(&global_ft, id);
+ if (fp && fp->conn) {
+ ksmbd_put_durable_fd(fp);
+ fp = NULL;
+ }
+
+ return fp;
+}
+
+void ksmbd_put_durable_fd(struct ksmbd_file *fp)
+{
+ if (!atomic_dec_and_test(&fp->refcount))
+ return;
+
+ __ksmbd_close_fd(NULL, fp);
+}
+
struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid)
{
struct ksmbd_file *fp = NULL;
@@ -639,6 +661,32 @@ __close_file_table_ids(struct ksmbd_file_table *ft,
return num;
}
+static inline bool is_reconnectable(struct ksmbd_file *fp)
+{
+ struct oplock_info *opinfo = opinfo_get(fp);
+ bool reconn = false;
+
+ if (!opinfo)
+ return false;
+
+ if (opinfo->op_state != OPLOCK_STATE_NONE) {
+ opinfo_put(opinfo);
+ return false;
+ }
+
+ if (fp->is_resilient || fp->is_persistent)
+ reconn = true;
+ else if (fp->is_durable && opinfo->is_lease &&
+ opinfo->o_lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ reconn = true;
+
+ else if (fp->is_durable && opinfo->level == SMB2_OPLOCK_LEVEL_BATCH)
+ reconn = true;
+
+ opinfo_put(opinfo);
+ return reconn;
+}
+
static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
struct ksmbd_file *fp)
{
@@ -648,7 +696,28 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
static bool session_fd_check(struct ksmbd_tree_connect *tcon,
struct ksmbd_file *fp)
{
- return false;
+ struct ksmbd_inode *ci;
+ struct oplock_info *op;
+ struct ksmbd_conn *conn;
+
+ if (!is_reconnectable(fp))
+ return false;
+
+ conn = fp->conn;
+ ci = fp->f_ci;
+ write_lock(&ci->m_lock);
+ list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
+ if (op->conn != conn)
+ continue;
+ op->conn = NULL;
+ }
+ write_unlock(&ci->m_lock);
+
+ fp->conn = NULL;
+ fp->tcon = NULL;
+ fp->volatile_id = KSMBD_NO_FID;
+
+ return true;
}
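
Taken together, is_reconnectable() and the new session_fd_check() implement the server side of durable-handle survival; the lifecycle, as recorded in this patch:

/*
 * disconnect:  session_fd_check()
 *                -> is_reconnectable(fp)? then keep fp in global_ft,
 *                   clear fp->conn / fp->tcon, drop the volatile id,
 *                   and detach matching oplocks (op->conn = NULL)
 * reconnect:   parse_durable_handle_context() finds fp by persistent
 *              id or CreateGuid
 *                -> smb2_check_durable_oplock() revalidates the client
 *                -> ksmbd_reopen_durable_fd() reattaches conn/tcon and
 *                   allocates a fresh volatile id
 */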
void ksmbd_close_tree_conn_fds(struct ksmbd_work *work)
@@ -687,6 +756,68 @@ void ksmbd_free_global_file_table(void)
ksmbd_destroy_file_table(&global_ft);
}
+int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share,
+ struct ksmbd_file *fp, char *name)
+{
+ char *pathname, *ab_pathname;
+ int ret = 0;
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return -EACCES;
+
+ ab_pathname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+ if (IS_ERR(ab_pathname)) {
+ kfree(pathname);
+ return -EACCES;
+ }
+
+ if (name && strcmp(&ab_pathname[share->path_sz + 1], name)) {
+ ksmbd_debug(SMB, "invalid name reconnect %s\n", name);
+ ret = -EINVAL;
+ }
+
+ kfree(pathname);
+
+ return ret;
+}
+
+int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
+{
+ struct ksmbd_inode *ci;
+ struct oplock_info *op;
+
+ if (!fp->is_durable || fp->conn || fp->tcon) {
+ pr_err("Invalid durable fd [%p:%p]\n", fp->conn, fp->tcon);
+ return -EBADF;
+ }
+
+ if (has_file_id(fp->volatile_id)) {
+ pr_err("Still in use durable fd: %llu\n", fp->volatile_id);
+ return -EBADF;
+ }
+
+ fp->conn = work->conn;
+ fp->tcon = work->tcon;
+
+ ci = fp->f_ci;
+ write_lock(&ci->m_lock);
+ list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
+ if (op->conn)
+ continue;
+ op->conn = fp->conn;
+ }
+ write_unlock(&ci->m_lock);
+
+ __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID);
+ if (!has_file_id(fp->volatile_id)) {
+ fp->conn = NULL;
+ fp->tcon = NULL;
+ return -EBADF;
+ }
+ return 0;
+}
+
int ksmbd_init_file_table(struct ksmbd_file_table *ft)
{
ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL);
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index a528f0cc775a..ed44fb4e18e7 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -14,6 +14,7 @@
#include <linux/workqueue.h>
#include "vfs.h"
+#include "mgmt/share_config.h"
/* Windows style file permissions for extended response */
#define FILE_GENERIC_ALL 0x1F01FF
@@ -106,6 +107,9 @@ struct ksmbd_file {
int dot_dotdot[2];
unsigned int f_state;
bool reserve_lease_break;
+ bool is_durable;
+ bool is_persistent;
+ bool is_resilient;
};
static inline void set_ctx_actor(struct dir_context *ctx,
@@ -141,7 +145,9 @@ struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp);
struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d);
void ksmbd_inode_put(struct ksmbd_inode *ci);
+struct ksmbd_file *ksmbd_lookup_global_fd(unsigned long long id);
struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id);
+void ksmbd_put_durable_fd(struct ksmbd_file *fp);
struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid);
struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry);
unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp);
@@ -173,6 +179,9 @@ void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
int file_info);
+int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp);
+int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share,
+ struct ksmbd_file *fp, char *name);
int ksmbd_init_file_cache(void);
void ksmbd_exit_file_cache(void);
#endif /* __VFS_CACHE_H__ */
diff --git a/fs/super.c b/fs/super.c
index ee05ab6b37e7..71d9779c42b1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1515,11 +1515,29 @@ static int fs_bdev_thaw(struct block_device *bdev)
return error;
}
+static void fs_bdev_super_get(void *data)
+{
+ struct super_block *sb = data;
+
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+}
+
+static void fs_bdev_super_put(void *data)
+{
+ struct super_block *sb = data;
+
+ put_super(sb);
+}
+
const struct blk_holder_ops fs_holder_ops = {
.mark_dead = fs_bdev_mark_dead,
.sync = fs_bdev_sync,
.freeze = fs_bdev_freeze,
.thaw = fs_bdev_thaw,
+ .get_holder = fs_bdev_super_get,
+ .put_holder = fs_bdev_super_put,
};
EXPORT_SYMBOL_GPL(fs_holder_ops);
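
The two new hooks let the block layer pin the superblock before calling back into it. How a caller brackets a callback is an assumption here (the block-layer side is not in this diff), but the intended shape is roughly:

#include <linux/blkdev.h>

static void example_notify_holder(const struct blk_holder_ops *ops,
				  void *holder)
{
	if (ops->get_holder)
		ops->get_holder(holder);	/* sb->s_count++ above */
	/* ... safely invoke ops->mark_dead / ops->sync / ... */
	if (ops->put_holder)
		ops->put_holder(holder);	/* put_super() above */
}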
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 138676463336..d22ad67a0f32 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -31,6 +31,17 @@ static void remove_files(struct kernfs_node *parent,
kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
}
+static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj)
+{
+ if (grp->attrs && grp->attrs[0] && grp->is_visible)
+ return grp->is_visible(kobj, grp->attrs[0], 0);
+
+ if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible)
+ return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0);
+
+ return 0;
+}
+
static int create_files(struct kernfs_node *parent, struct kobject *kobj,
kuid_t uid, kgid_t gid,
const struct attribute_group *grp, int update)
@@ -52,6 +63,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
kernfs_remove_by_name(parent, (*attr)->name);
if (grp->is_visible) {
mode = grp->is_visible(kobj, *attr, i);
+ mode &= ~SYSFS_GROUP_INVISIBLE;
if (!mode)
continue;
}
@@ -81,6 +93,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*bin_attr)->attr.name);
if (grp->is_bin_visible) {
mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ mode &= ~SYSFS_GROUP_INVISIBLE;
if (!mode)
continue;
}
@@ -127,16 +140,31 @@ static int internal_create_group(struct kobject *kobj, int update,
kobject_get_ownership(kobj, &uid, &gid);
if (grp->name) {
+ umode_t mode = __first_visible(grp, kobj);
+
+ if (mode & SYSFS_GROUP_INVISIBLE)
+ mode = 0;
+ else
+ mode = S_IRWXU | S_IRUGO | S_IXUGO;
+
if (update) {
kn = kernfs_find_and_get(kobj->sd, grp->name);
if (!kn) {
- pr_warn("Can't update unknown attr grp name: %s/%s\n",
- kobj->name, grp->name);
- return -EINVAL;
+ pr_debug("attr grp %s/%s not created yet\n",
+ kobj->name, grp->name);
+ /* may have been invisible prior to this update */
+ update = 0;
+ } else if (!mode) {
+ sysfs_remove_group(kobj, grp);
+ kernfs_put(kn);
+ return 0;
}
- } else {
- kn = kernfs_create_dir_ns(kobj->sd, grp->name,
- S_IRWXU | S_IRUGO | S_IXUGO,
+ }
+
+ if (!update) {
+ if (!mode)
+ return 0;
+ kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode,
uid, gid, kobj, NULL);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
@@ -279,9 +307,8 @@ void sysfs_remove_group(struct kobject *kobj,
if (grp->name) {
kn = kernfs_find_and_get(parent, grp->name);
if (!kn) {
- WARN(!kn, KERN_WARNING
- "sysfs group '%s' not found for kobject '%s'\n",
- grp->name, kobject_name(kobj));
+ pr_debug("sysfs group '%s' not found for kobject '%s'\n",
+ grp->name, kobject_name(kobj));
return;
}
} else {
@@ -318,13 +345,13 @@ void sysfs_remove_groups(struct kobject *kobj,
EXPORT_SYMBOL_GPL(sysfs_remove_groups);
/**
- * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * sysfs_merge_group - merge files into a pre-existing named attribute group.
* @kobj: The kobject containing the group.
* @grp: The files to create and the attribute group they belong to.
*
- * This function returns an error if the group doesn't exist or any of the
- * files already exist in that group, in which case none of the new files
- * are created.
+ * This function returns an error if the group doesn't exist, the .name field is
+ * NULL or any of the files already exist in that group, in which case none of
+ * the new files are created.
*/
int sysfs_merge_group(struct kobject *kobj,
const struct attribute_group *grp)
@@ -356,7 +383,7 @@ int sysfs_merge_group(struct kobject *kobj,
EXPORT_SYMBOL_GPL(sysfs_merge_group);
/**
- * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * sysfs_unmerge_group - remove files from a pre-existing named attribute group.
* @kobj: The kobject containing the group.
* @grp: The files to remove and the attribute group they belong to.
*/
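
With __first_visible() in place, a group's ->is_visible() callback can hide the entire named directory by returning SYSFS_GROUP_INVISIBLE for attribute 0; create_files() masks the flag off before using the return value as a per-file mode. A driver-side sketch, where feature_enabled() is a hypothetical predicate:

#include <linux/sysfs.h>

static umode_t example_attr_visible(struct kobject *kobj,
				    struct attribute *attr, int n)
{
	if (!feature_enabled(kobj))		/* hypothetical */
		return SYSFS_GROUP_INVISIBLE;	/* hide the whole group */
	return attr->mode;
}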
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 110e8a272189..dc067eeb6387 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -35,6 +35,17 @@ static DEFINE_MUTEX(eventfs_mutex);
/* Choose something "unique" ;-) */
#define EVENTFS_FILE_INODE_INO 0x12c4e37
+struct eventfs_root_inode {
+ struct eventfs_inode ei;
+ struct dentry *events_dir;
+};
+
+static struct eventfs_root_inode *get_root_inode(struct eventfs_inode *ei)
+{
+ WARN_ON_ONCE(!ei->is_events);
+ return container_of(ei, struct eventfs_root_inode, ei);
+}
+
/* Just try to make something consistent and unique */
static int eventfs_dir_ino(struct eventfs_inode *ei)
{
@@ -73,12 +84,18 @@ enum {
static void release_ei(struct kref *ref)
{
struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+ struct eventfs_root_inode *rei;
WARN_ON_ONCE(!ei->is_freed);
kfree(ei->entry_attrs);
kfree_const(ei->name);
- kfree_rcu(ei, rcu);
+ if (ei->is_events) {
+ rei = get_root_inode(ei);
+ kfree_rcu(rei, ei.rcu);
+ } else {
+ kfree_rcu(ei, rcu);
+ }
}
static inline void put_ei(struct eventfs_inode *ei)
@@ -408,19 +425,43 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry,
return NULL;
}
+static inline struct eventfs_inode *init_ei(struct eventfs_inode *ei, const char *name)
+{
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name)
+ return NULL;
+ kref_init(&ei->kref);
+ return ei;
+}
+
static inline struct eventfs_inode *alloc_ei(const char *name)
{
struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ struct eventfs_inode *result;
if (!ei)
return NULL;
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name) {
+ result = init_ei(ei, name);
+ if (!result)
kfree(ei);
+
+ return result;
+}
+
+static inline struct eventfs_inode *alloc_root_ei(const char *name)
+{
+ struct eventfs_root_inode *rei = kzalloc(sizeof(*rei), GFP_KERNEL);
+ struct eventfs_inode *ei;
+
+ if (!rei)
return NULL;
- }
- kref_init(&ei->kref);
+
+ rei->ei.is_events = 1;
+ ei = init_ei(&rei->ei, name);
+ if (!ei)
+ kfree(rei);
+
return ei;
}
@@ -483,7 +524,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *result = NULL;
ti = get_tracefs(dir);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
+ if (WARN_ON_ONCE(!(ti->flags & TRACEFS_EVENT_INODE)))
return ERR_PTR(-EIO);
mutex_lock(&eventfs_mutex);
@@ -495,7 +536,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
list_for_each_entry(ei_child, &ei->children, list) {
if (strcmp(ei_child->name, name) != 0)
continue;
- if (ei_child->is_freed)
+ /* A child is freed and removed from the list at the same time */
+ if (WARN_ON_ONCE(ei_child->is_freed))
goto out;
result = lookup_dir_entry(dentry, ei, ei_child);
goto out;
@@ -709,6 +751,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
int size, void *data)
{
struct dentry *dentry = tracefs_start_creating(name, parent);
+ struct eventfs_root_inode *rei;
struct eventfs_inode *ei;
struct tracefs_inode *ti;
struct inode *inode;
@@ -721,7 +764,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
if (IS_ERR(dentry))
return ERR_CAST(dentry);
- ei = alloc_ei(name);
+ ei = alloc_root_ei(name);
if (!ei)
goto fail;
@@ -730,10 +773,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
goto fail;
// Note: we have a ref to the dentry from tracefs_start_creating()
- ei->events_dir = dentry;
+ rei = get_root_inode(ei);
+ rei->events_dir = dentry;
+
ei->entries = entries;
ei->nr_entries = size;
- ei->is_events = 1;
ei->data = data;
/* Save the ownership of this directory */
@@ -844,13 +888,15 @@ void eventfs_remove_dir(struct eventfs_inode *ei)
*/
void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
+ struct eventfs_root_inode *rei;
struct dentry *dentry;
- dentry = ei->events_dir;
+ rei = get_root_inode(ei);
+ dentry = rei->events_dir;
if (!dentry)
return;
- ei->events_dir = NULL;
+ rei->events_dir = NULL;
eventfs_remove_dir(ei);
/*
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index beb3dcd0e434..15c26f9aaad4 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -36,7 +36,6 @@ struct eventfs_attr {
* @children: link list into the child eventfs_inode
* @entries: the array of entries representing the files in the directory
* @name: the name of the directory to create
- * @events_dir: the dentry of the events directory
* @entry_attrs: Saved mode and ownership of the @d_children
* @data: The private data to pass to the callbacks
* @attr: Saved mode and ownership of eventfs_inode itself
@@ -54,7 +53,6 @@ struct eventfs_inode {
struct list_head children;
const struct eventfs_entry *entries;
const char *name;
- struct dentry *events_dir;
struct eventfs_attr *entry_attrs;
void *data;
struct eventfs_attr attr;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index d013c5b3f1ed..ac77ac1fd73e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
err = dbg_walk_index(c, NULL, add_size, &calc);
if (err) {
ubifs_err(c, "error %d while walking the index", err);
- return err;
+ goto out_err;
}
if (calc != idx_size) {
ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld",
calc, idx_size);
dump_stack();
- return -EINVAL;
+ err = -EINVAL;
+ goto out_err;
}
return 0;
+
+out_err:
+ ubifs_destroy_tnc_tree(c);
+ return err;
}
/**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 551148de66cd..eac0fef801f1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1133,6 +1133,8 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
+ /* Free inode->i_link before inode is marked as bad. */
+ fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
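The new fscrypt_free_inode() call must come before make_bad_inode() because the latter overwrites i_mode with S_IFREG; after that the generic free path no longer sees an encrypted symlink and would leak inode->i_link. For reference, the fscrypt helper is essentially the following (paraphrased, not part of this patch):

void fscrypt_free_inode(struct inode *inode)
{
	/* i_link is only allocated for encrypted symlinks */
	if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode))
		kfree(inode->i_link);
}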
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5029eb3390a5..a1f46919934c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -96,36 +96,36 @@ dump:
return -EINVAL;
}
-static int do_readpage(struct page *page)
+static int do_readpage(struct folio *folio)
{
void *addr;
int err = 0, i;
unsigned int block, beyond;
- struct ubifs_data_node *dn;
- struct inode *inode = page->mapping->host;
+ struct ubifs_data_node *dn = NULL;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
loff_t i_size = i_size_read(inode);
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
- inode->i_ino, page->index, i_size, page->flags);
- ubifs_assert(c, !PageChecked(page));
- ubifs_assert(c, !PagePrivate(page));
+ inode->i_ino, folio->index, i_size, folio->flags);
+ ubifs_assert(c, !folio_test_checked(folio));
+ ubifs_assert(c, !folio->private);
- addr = kmap(page);
+ addr = kmap_local_folio(folio, 0);
- block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
if (block >= beyond) {
/* Reading beyond inode */
- SetPageChecked(page);
- memset(addr, 0, PAGE_SIZE);
+ folio_set_checked(folio);
+ addr = folio_zero_tail(folio, 0, addr);
goto out;
}
dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
if (!dn) {
err = -ENOMEM;
- goto error;
+ goto out;
}
i = 0;
@@ -150,39 +150,35 @@ static int do_readpage(struct page *page)
memset(addr + ilen, 0, dlen - ilen);
}
}
- if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio)))
break;
block += 1;
addr += UBIFS_BLOCK_SIZE;
+ if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
+ kunmap_local(addr - UBIFS_BLOCK_SIZE);
+ addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
+ }
}
+
if (err) {
struct ubifs_info *c = inode->i_sb->s_fs_info;
if (err == -ENOENT) {
/* Not found, so it must be a hole */
- SetPageChecked(page);
+ folio_set_checked(folio);
dbg_gen("hole");
- goto out_free;
+ err = 0;
+ } else {
+ ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
+ folio->index, inode->i_ino, err);
}
- ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
- page->index, inode->i_ino, err);
- goto error;
}
-out_free:
- kfree(dn);
out:
- SetPageUptodate(page);
- ClearPageError(page);
- flush_dcache_page(page);
- kunmap(page);
- return 0;
-
-error:
kfree(dn);
- ClearPageUptodate(page);
- SetPageError(page);
- flush_dcache_page(page);
- kunmap(page);
+ if (!err)
+ folio_mark_uptodate(folio);
+ flush_dcache_folio(folio);
+ kunmap_local(addr);
return err;
}
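Large folios can no longer be mapped wholesale with kmap(): on HIGHMEM configurations kmap_local_folio() maps a single page at a time, so the block walk has to remap whenever the kernel virtual address crosses a page boundary. The pattern used in do_readpage(), populate_page() and do_writepage() below, reduced to its skeleton (process_block() is a hypothetical stand-in for the per-block work):

	addr = kmap_local_folio(folio, 0);
	i = 0;
	while (1) {
		process_block(addr);	/* hypothetical per-block work */
		if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio)))
			break;
		addr += UBIFS_BLOCK_SIZE;
		/* only one page of a highmem folio is mapped at a time */
		if (folio_test_highmem(folio) && offset_in_page(addr) == 0) {
			kunmap_local(addr - UBIFS_BLOCK_SIZE);
			addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
		}
	}
	kunmap_local(addr);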
@@ -222,16 +218,16 @@ static int write_begin_slow(struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
struct ubifs_budget_req req = { .new_page = 1 };
int err, appending = !!(pos + len > inode->i_size);
- struct page *page;
+ struct folio *folio;
dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
inode->i_ino, pos, len, inode->i_size);
/*
- * At the slow path we have to budget before locking the page, because
- * budgeting may force write-back, which would wait on locked pages and
- * deadlock if we had the page locked. At this point we do not know
- * anything about the page, so assume that this is a new page which is
+ * At the slow path we have to budget before locking the folio, because
+ * budgeting may force write-back, which would wait on locked folios and
+ * deadlock if we had the folio locked. At this point we do not know
+ * anything about the folio, so assume that this is a new folio which is
* written to a hole. This corresponds to largest budget. Later the
* budget will be amended if this is not true.
*/
@@ -243,45 +239,43 @@ static int write_begin_slow(struct address_space *mapping,
if (unlikely(err))
return err;
- page = grab_cache_page_write_begin(mapping, index);
- if (unlikely(!page)) {
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio)) {
ubifs_release_budget(c, &req);
- return -ENOMEM;
+ return PTR_ERR(folio);
}
- if (!PageUptodate(page)) {
- if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE)
- SetPageChecked(page);
+ if (!folio_test_uptodate(folio)) {
+ if (pos == folio_pos(folio) && len >= folio_size(folio))
+ folio_set_checked(folio);
else {
- err = do_readpage(page);
+ err = do_readpage(folio);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
ubifs_release_budget(c, &req);
return err;
}
}
-
- SetPageUptodate(page);
- ClearPageError(page);
}
- if (PagePrivate(page))
+ if (folio->private)
/*
- * The page is dirty, which means it was budgeted twice:
+ * The folio is dirty, which means it was budgeted twice:
* o first time the budget was allocated by the task which
- * made the page dirty and set the PG_private flag;
+ * made the folio dirty and set the private field;
* o and then we budgeted for it for the second time at the
* very beginning of this function.
*
- * So what we have to do is to release the page budget we
+ * So what we have to do is to release the folio budget we
* allocated.
*/
release_new_page_budget(c);
- else if (!PageChecked(page))
+ else if (!folio_test_checked(folio))
/*
- * We are changing a page which already exists on the media.
- * This means that changing the page does not make the amount
+ * We are changing a folio which already exists on the media.
+ * This means that changing the folio does not make the amount
* of indexing information larger, and this part of the budget
* which we have already acquired may be released.
*/
@@ -304,14 +298,14 @@ static int write_begin_slow(struct address_space *mapping,
ubifs_release_dirty_inode_budget(c, ui);
}
- *pagep = page;
+ *pagep = &folio->page;
return 0;
}
/**
* allocate_budget - allocate budget for 'ubifs_write_begin()'.
* @c: UBIFS file-system description object
- * @page: page to allocate budget for
+ * @folio: folio to allocate budget for
* @ui: UBIFS inode object the page belongs to
* @appending: non-zero if the page is appended
*
@@ -322,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping,
*
* Returns: %0 in case of success and %-ENOSPC in case of failure.
*/
-static int allocate_budget(struct ubifs_info *c, struct page *page,
+static int allocate_budget(struct ubifs_info *c, struct folio *folio,
struct ubifs_inode *ui, int appending)
{
struct ubifs_budget_req req = { .fast = 1 };
- if (PagePrivate(page)) {
+ if (folio->private) {
if (!appending)
/*
- * The page is dirty and we are not appending, which
+ * The folio is dirty and we are not appending, which
* means no budget is needed at all.
*/
return 0;
@@ -354,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page,
*/
req.dirtied_ino = 1;
} else {
- if (PageChecked(page))
+ if (folio_test_checked(folio))
/*
* The page corresponds to a hole and does not
* exist on the media. So changing it makes
- * make the amount of indexing information
+ * the amount of indexing information
* larger, and we have to budget for a new
* page.
*/
@@ -428,7 +422,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
int err, appending = !!(pos + len > inode->i_size);
int skipped_read = 0;
- struct page *page;
+ struct folio *folio;
ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size);
ubifs_assert(c, !c->ro_media && !c->ro_mount);
@@ -437,13 +431,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return -EROFS;
/* Try out the fast-path part first */
- page = grab_cache_page_write_begin(mapping, index);
- if (unlikely(!page))
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
/* The page is not loaded from the flash */
- if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) {
+ if (pos == folio_pos(folio) && len >= folio_size(folio)) {
/*
* We change whole page so no need to load it. But we
* do not know whether this page exists on the media or
@@ -453,32 +448,27 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* media. Thus, we are setting the @PG_checked flag
* here.
*/
- SetPageChecked(page);
+ folio_set_checked(folio);
skipped_read = 1;
} else {
- err = do_readpage(page);
+ err = do_readpage(folio);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
}
-
- SetPageUptodate(page);
- ClearPageError(page);
}
- err = allocate_budget(c, page, ui, appending);
+ err = allocate_budget(c, folio, ui, appending);
if (unlikely(err)) {
ubifs_assert(c, err == -ENOSPC);
/*
* If we skipped reading the page because we were going to
* write all of it, then it is not up to date.
*/
- if (skipped_read) {
- ClearPageChecked(page);
- ClearPageUptodate(page);
- }
+ if (skipped_read)
+ folio_clear_checked(folio);
/*
* Budgeting failed which means it would have to force
* write-back but didn't, because we set the @fast flag in the
@@ -490,8 +480,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
mutex_unlock(&ui->ui_mutex);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return write_begin_slow(mapping, pos, len, pagep);
}
@@ -502,22 +492,21 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* with @ui->ui_mutex locked if we are appending pages, and unlocked
* otherwise. This is an optimization (slightly hacky though).
*/
- *pagep = page;
+ *pagep = &folio->page;
return 0;
-
}
/**
* cancel_budget - cancel budget.
* @c: UBIFS file-system description object
- * @page: page to cancel budget for
+ * @folio: folio to cancel budget for
* @ui: UBIFS inode object the page belongs to
* @appending: non-zero if the page is appended
*
* This is a helper function for a page write operation. It unlocks the
* @ui->ui_mutex in case of appending.
*/
-static void cancel_budget(struct ubifs_info *c, struct page *page,
+static void cancel_budget(struct ubifs_info *c, struct folio *folio,
struct ubifs_inode *ui, int appending)
{
if (appending) {
@@ -525,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page,
ubifs_release_dirty_inode_budget(c, ui);
mutex_unlock(&ui->ui_mutex);
}
- if (!PagePrivate(page)) {
- if (PageChecked(page))
+ if (!folio->private) {
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
@@ -537,6 +526,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -544,44 +534,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
int appending = !!(end_pos > inode->i_size);
dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
- inode->i_ino, pos, page->index, len, copied, inode->i_size);
+ inode->i_ino, pos, folio->index, len, copied, inode->i_size);
- if (unlikely(copied < len && len == PAGE_SIZE)) {
+ if (unlikely(copied < len && !folio_test_uptodate(folio))) {
/*
- * VFS copied less data to the page that it intended and
+ * VFS copied less data to the folio than it intended and
* declared in its '->write_begin()' call via the @len
- * argument. If the page was not up-to-date, and @len was
- * @PAGE_SIZE, the 'ubifs_write_begin()' function did
+ * argument. If the folio was not up-to-date,
+ * the 'ubifs_write_begin()' function did
* not load it from the media (for optimization reasons). This
- * means that part of the page contains garbage. So read the
- * page now.
+ * means that part of the folio contains garbage. So read the
+ * folio now.
*/
dbg_gen("copied %d instead of %d, read page and repeat",
copied, len);
- cancel_budget(c, page, ui, appending);
- ClearPageChecked(page);
+ cancel_budget(c, folio, ui, appending);
+ folio_clear_checked(folio);
/*
* Return 0 to force VFS to repeat the whole operation, or the
* error code if 'do_readpage()' fails.
*/
- copied = do_readpage(page);
+ copied = do_readpage(folio);
goto out;
}
- if (!PagePrivate(page)) {
- attach_page_private(page, (void *)1);
+ if (len == folio_size(folio))
+ folio_mark_uptodate(folio);
+
+ if (!folio->private) {
+ folio_attach_private(folio, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(mapping, folio);
}
if (appending) {
i_size_write(inode, end_pos);
ui->ui_size = end_pos;
/*
- * Note, we do not set @I_DIRTY_PAGES (which means that the
- * inode has dirty pages), this has been done in
- * '__set_page_dirty_nobuffers()'.
+ * We do not set @I_DIRTY_PAGES (which means that
+ * the inode has dirty pages), this was done in
+ * filemap_dirty_folio().
*/
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
@@ -589,43 +582,43 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
/**
* populate_page - copy data nodes into a page for bulk-read.
* @c: UBIFS file-system description object
- * @page: page
+ * @folio: folio
* @bu: bulk-read information
* @n: next zbranch slot
*
* Returns: %0 on success and a negative error code on failure.
*/
-static int populate_page(struct ubifs_info *c, struct page *page,
+static int populate_page(struct ubifs_info *c, struct folio *folio,
struct bu_info *bu, int *n)
{
int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
loff_t i_size = i_size_read(inode);
unsigned int page_block;
void *addr, *zaddr;
pgoff_t end_index;
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
- inode->i_ino, page->index, i_size, page->flags);
+ inode->i_ino, folio->index, i_size, folio->flags);
- addr = zaddr = kmap(page);
+ addr = zaddr = kmap_local_folio(folio, 0);
end_index = (i_size - 1) >> PAGE_SHIFT;
- if (!i_size || page->index > end_index) {
+ if (!i_size || folio->index > end_index) {
hole = 1;
- memset(addr, 0, PAGE_SIZE);
+ addr = folio_zero_tail(folio, 0, addr);
goto out_hole;
}
- page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
while (1) {
int err, len, out_len, dlen;
@@ -674,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page,
break;
addr += UBIFS_BLOCK_SIZE;
page_block += 1;
+ if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
+ kunmap_local(addr - UBIFS_BLOCK_SIZE);
+ addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
+ }
}
- if (end_index == page->index) {
+ if (end_index == folio->index) {
int len = i_size & (PAGE_SIZE - 1);
if (len && len < read)
@@ -685,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page,
out_hole:
if (hole) {
- SetPageChecked(page);
+ folio_set_checked(folio);
dbg_gen("hole");
}
- SetPageUptodate(page);
- ClearPageError(page);
- flush_dcache_page(page);
- kunmap(page);
+ folio_mark_uptodate(folio);
+ flush_dcache_folio(folio);
+ kunmap_local(addr);
*n = nn;
return 0;
out_err:
- ClearPageUptodate(page);
- SetPageError(page);
- flush_dcache_page(page);
- kunmap(page);
+ flush_dcache_folio(folio);
+ kunmap_local(addr);
ubifs_err(c, "bad data node (block %u, inode %lu)",
page_block, inode->i_ino);
return -EINVAL;
@@ -710,15 +704,15 @@ out_err:
* ubifs_do_bulk_read - do bulk-read.
* @c: UBIFS file-system description object
* @bu: bulk-read information
- * @page1: first page to read
+ * @folio1: first folio to read
*
* Returns: %1 if the bulk-read is done, otherwise %0 is returned.
*/
static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
- struct page *page1)
+ struct folio *folio1)
{
- pgoff_t offset = page1->index, end_index;
- struct address_space *mapping = page1->mapping;
+ pgoff_t offset = folio1->index, end_index;
+ struct address_space *mapping = folio1->mapping;
struct inode *inode = mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
int err, page_idx, page_cnt, ret = 0, n = 0;
@@ -768,11 +762,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
goto out_warn;
}
- err = populate_page(c, page1, bu, &n);
+ err = populate_page(c, folio1, bu, &n);
if (err)
goto out_warn;
- unlock_page(page1);
+ folio_unlock(folio1);
ret = 1;
isize = i_size_read(inode);
@@ -782,19 +776,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
for (page_idx = 1; page_idx < page_cnt; page_idx++) {
pgoff_t page_offset = offset + page_idx;
- struct page *page;
+ struct folio *folio;
if (page_offset > end_index)
break;
- page = pagecache_get_page(mapping, page_offset,
+ folio = __filemap_get_folio(mapping, page_offset,
FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
ra_gfp_mask);
- if (!page)
+ if (IS_ERR(folio))
break;
- if (!PageUptodate(page))
- err = populate_page(c, page, bu, &n);
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio))
+ err = populate_page(c, folio, bu, &n);
+ folio_unlock(folio);
+ folio_put(folio);
if (err)
break;
}
@@ -817,7 +811,7 @@ out_bu_off:
/**
* ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
- * @page: page from which to start bulk-read.
+ * @folio: folio from which to start bulk-read.
*
* Some flash media are capable of reading sequentially at faster rates. UBIFS
* bulk-read facility is designed to take advantage of that, by reading in one
@@ -826,12 +820,12 @@ out_bu_off:
*
* Returns: %1 if a bulk-read is done and %0 otherwise.
*/
-static int ubifs_bulk_read(struct page *page)
+static int ubifs_bulk_read(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
- pgoff_t index = page->index, last_page_read = ui->last_page_read;
+ pgoff_t index = folio->index, last_page_read = ui->last_page_read;
struct bu_info *bu;
int err = 0, allocated = 0;
@@ -879,8 +873,8 @@ static int ubifs_bulk_read(struct page *page)
bu->buf_len = c->max_bu_buf_len;
data_key_init(c, &bu->key, inode->i_ino,
- page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
- err = ubifs_do_bulk_read(c, bu, page);
+ folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+ err = ubifs_do_bulk_read(c, bu, folio);
if (!allocated)
mutex_unlock(&c->bu_mutex);
@@ -894,69 +888,71 @@ out_unlock:
static int ubifs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
-
- if (ubifs_bulk_read(page))
+ if (ubifs_bulk_read(folio))
return 0;
- do_readpage(page);
+ do_readpage(folio);
folio_unlock(folio);
return 0;
}
-static int do_writepage(struct page *page, int len)
+static int do_writepage(struct folio *folio, size_t len)
{
- int err = 0, i, blen;
+ int err = 0, blen;
unsigned int block;
void *addr;
+ size_t offset = 0;
union ubifs_key key;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
#ifdef UBIFS_DEBUG
struct ubifs_inode *ui = ubifs_inode(inode);
spin_lock(&ui->ui_lock);
- ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT);
+ ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT);
spin_unlock(&ui->ui_lock);
#endif
- /* Update radix tree tags */
- set_page_writeback(page);
+ folio_start_writeback(folio);
- addr = kmap(page);
- block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
- i = 0;
- while (len) {
- blen = min_t(int, len, UBIFS_BLOCK_SIZE);
+ addr = kmap_local_folio(folio, offset);
+ block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ for (;;) {
+ blen = min_t(size_t, len, UBIFS_BLOCK_SIZE);
data_key_init(c, &key, inode->i_ino, block);
err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
if (err)
break;
- if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ len -= blen;
+ if (!len)
break;
block += 1;
addr += blen;
- len -= blen;
+ if (folio_test_highmem(folio) && !offset_in_page(addr)) {
+ kunmap_local(addr - blen);
+ offset += PAGE_SIZE;
+ addr = kmap_local_folio(folio, offset);
+ }
}
+ kunmap_local(addr);
if (err) {
- SetPageError(page);
- ubifs_err(c, "cannot write page %lu of inode %lu, error %d",
- page->index, inode->i_ino, err);
+ mapping_set_error(folio->mapping, err);
+ ubifs_err(c, "cannot write folio %lu of inode %lu, error %d",
+ folio->index, inode->i_ino, err);
ubifs_ro_mode(c, err);
}
- ubifs_assert(c, PagePrivate(page));
- if (PageChecked(page))
+ ubifs_assert(c, folio->private != NULL);
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- detach_page_private(page);
- ClearPageChecked(page);
+ folio_detach_private(folio);
+ folio_clear_checked(folio);
- kunmap(page);
- unlock_page(page);
- end_page_writeback(page);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
return err;
}
@@ -1006,22 +1002,21 @@ static int do_writepage(struct page *page, int len)
* on the page lock and it would not write the truncated inode node to the
* journal before we have finished.
*/
-static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
+static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc,
+ void *data)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
loff_t i_size = i_size_read(inode), synced_i_size;
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- int err, len = i_size & (PAGE_SIZE - 1);
- void *kaddr;
+ int err, len = folio_size(folio);
dbg_gen("ino %lu, pg %lu, pg flags %#lx",
- inode->i_ino, page->index, page->flags);
- ubifs_assert(c, PagePrivate(page));
+ inode->i_ino, folio->index, folio->flags);
+ ubifs_assert(c, folio->private != NULL);
- /* Is the page fully outside @i_size? (truncate in progress) */
- if (page->index > end_index || (page->index == end_index && !len)) {
+ /* Is the folio fully outside @i_size? (truncate in progress) */
+ if (folio_pos(folio) >= i_size) {
err = 0;
goto out_unlock;
}
@@ -1030,9 +1025,9 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
synced_i_size = ui->synced_i_size;
spin_unlock(&ui->ui_lock);
- /* Is the page fully inside @i_size? */
- if (page->index < end_index) {
- if (page->index >= synced_i_size >> PAGE_SHIFT) {
+ /* Is the folio fully inside i_size? */
+ if (folio_pos(folio) + len <= i_size) {
+ if (folio_pos(folio) >= synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out_redirty;
@@ -1045,20 +1040,18 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* with this.
*/
}
- return do_writepage(page, PAGE_SIZE);
+ return do_writepage(folio, len);
}
/*
- * The page straddles @i_size. It must be zeroed out on each and every
+ * The folio straddles @i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- kaddr = kmap_atomic(page);
- memset(kaddr + len, 0, PAGE_SIZE - len);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ len = i_size - folio_pos(folio);
+ folio_zero_segment(folio, len, folio_size(folio));
if (i_size > synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
@@ -1066,19 +1059,25 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
goto out_redirty;
}
- return do_writepage(page, len);
+ return do_writepage(folio, len);
out_redirty:
/*
- * redirty_page_for_writepage() won't call ubifs_dirty_inode() because
+ * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because
* it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so
* there is no need to do space budget for dirty inode.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
+static int ubifs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return write_cache_pages(mapping, wbc, ubifs_writepage, NULL);
+}
+
/**
* do_attr_changes - change inode attributes.
* @inode: inode to change attributes for
@@ -1155,11 +1154,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
if (offset) {
pgoff_t index = new_size >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
- page = find_lock_page(inode->i_mapping, index);
- if (page) {
- if (PageDirty(page)) {
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (!IS_ERR(folio)) {
+ if (folio_test_dirty(folio)) {
/*
* 'ubifs_jnl_truncate()' will try to truncate
* the last data node, but it contains
@@ -1168,14 +1167,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
* 'ubifs_jnl_truncate()' will see an already
* truncated (and up to date) data node.
*/
- ubifs_assert(c, PagePrivate(page));
+ ubifs_assert(c, folio->private != NULL);
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
- offset = new_size &
- (PAGE_SIZE - 1);
- err = do_writepage(page, offset);
- put_page(page);
+ offset = offset_in_folio(folio,
+ new_size);
+ err = do_writepage(folio, offset);
+ folio_put(folio);
if (err)
goto out_budg;
/*
@@ -1188,8 +1187,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
* to 'ubifs_jnl_truncate()' to save it from
* having to read it.
*/
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
}
@@ -1512,14 +1511,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
*/
static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct timespec64 now = current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
int err, update_time;
- dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
+ dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index,
i_size_read(inode));
ubifs_assert(c, !c->ro_media && !c->ro_mount);
@@ -1527,17 +1526,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
return VM_FAULT_SIGBUS; /* -EROFS */
/*
- * We have not locked @page so far so we may budget for changing the
- * page. Note, we cannot do this after we locked the page, because
+ * We have not locked @folio so far so we may budget for changing the
+ * folio. Note, we cannot do this after we locked the folio, because
* budgeting may cause write-back which would cause deadlock.
*
- * At the moment we do not know whether the page is dirty or not, so we
- * assume that it is not and budget for a new page. We could look at
+ * At the moment we do not know whether the folio is dirty or not, so we
+ * assume that it is not and budget for a new folio. We could look at
* the @PG_private flag and figure this out, but we may race with write
- * back and the page state may change by the time we lock it, so this
+ * back and the folio state may change by the time we lock it, so this
* would need additional care. We do not bother with this at the
* moment, although it might be good idea to do. Instead, we allocate
- * budget for a new page and amend it later on if the page was in fact
+ * budget for a new folio and amend it later on if the folio was in fact
* dirty.
*
* The budgeting-related logic of this function is similar to what we
@@ -1560,21 +1559,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- lock_page(page);
- if (unlikely(page->mapping != inode->i_mapping ||
- page_offset(page) > i_size_read(inode))) {
- /* Page got truncated out from underneath us */
+ folio_lock(folio);
+ if (unlikely(folio->mapping != inode->i_mapping ||
+ folio_pos(folio) >= i_size_read(inode))) {
+ /* Folio got truncated out from underneath us */
goto sigbus;
}
- if (PagePrivate(page))
+ if (folio->private)
release_new_page_budget(c);
else {
- if (!PageChecked(page))
+ if (!folio_test_checked(folio))
ubifs_convert_page_budget(c);
- attach_page_private(page, (void *)1);
+ folio_attach_private(folio, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(folio->mapping, folio);
}
if (update_time) {
@@ -1590,11 +1589,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
ubifs_release_dirty_inode_budget(c, ui);
}
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
return VM_FAULT_LOCKED;
sigbus:
- unlock_page(page);
+ folio_unlock(folio);
ubifs_release_budget(c, &req);
return VM_FAULT_SIGBUS;
}
@@ -1648,7 +1647,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap,
const struct address_space_operations ubifs_file_address_operations = {
.read_folio = ubifs_read_folio,
- .writepage = ubifs_writepage,
+ .writepages = ubifs_writepages,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
.invalidate_folio = ubifs_invalidate_folio,
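With the address_space operations now folio-based, ubifs_writepage() distinguishes its three cases purely by offset arithmetic instead of page-index comparisons. Condensed from the hunks above (the synced_i_size/write_inode steps are elided here):

	loff_t i_size = i_size_read(inode);
	size_t len = folio_size(folio);

	if (folio_pos(folio) >= i_size)		/* fully beyond EOF: skip */
		return 0;
	if (folio_pos(folio) + len <= i_size)	/* fully inside: write it all */
		return do_writepage(folio, len);
	/* straddles EOF: zero the tail, write only the part below i_size */
	len = i_size - folio_pos(folio);
	folio_zero_segment(folio, len, folio_size(folio));
	return do_writepage(folio, len);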
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 873e6e1c92b5..6ebf3c04ac5f 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
*/
static int scan_for_dirty_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
data.pick_free = pick_free;
data.lnum = -1;
data.exclude_index = exclude_index;
- err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_for_dirty_cb,
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb,
&data);
if (err)
return ERR_PTR(err);
@@ -349,8 +349,9 @@ out:
*/
static int scan_for_free_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
data.pick_free = pick_free;
data.lnum = -1;
err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_for_free_cb,
+ scan_for_free_cb,
&data);
if (err)
return ERR_PTR(err);
@@ -589,8 +590,9 @@ out:
*/
static int scan_for_idx_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
int err;
data.lnum = -1;
- err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_for_idx_cb,
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb,
&data);
if (err)
return ERR_PTR(err);
@@ -726,11 +727,10 @@ out:
return err;
}
-static int cmp_dirty_idx(const struct ubifs_lprops **a,
- const struct ubifs_lprops **b)
+static int cmp_dirty_idx(const void *a, const void *b)
{
- const struct ubifs_lprops *lpa = *a;
- const struct ubifs_lprops *lpb = *b;
+ const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a;
+ const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b;
return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
}
@@ -754,7 +754,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
sizeof(void *) * c->dirty_idx.cnt);
/* Sort it so that the dirtiest is now at the end */
sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
- (int (*)(const void *, const void *))cmp_dirty_idx, NULL);
+ cmp_dirty_idx, NULL);
dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
if (c->dirty_idx.cnt)
dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
@@ -782,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
*/
static int scan_dirty_idx_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -842,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c)
if (c->pnodes_have >= c->pnode_cnt)
/* All pnodes are in memory, so skip scan */
return -ENOSPC;
- err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb,
&data);
if (err)
return err;
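All four scan callbacks get the same treatment: the final parameter becomes void *arg so that the function matches the ubifs_lpt_scan_callback typedef exactly, and the cast moves from the function pointer (undefined behaviour, and a kCFI violation) to the data pointer inside the callback. A minimal sketch of the shape, with scan_data reduced to one field:

struct scan_data {
	int lnum;
};

/* Matches the callback typedef, so call sites pass it without a cast. */
static int scan_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops,
		   int in_tree, void *arg)
{
	struct scan_data *data = arg;

	data->lnum = lprops->lnum;
	return LPT_SCAN_STOP;
}

The cmp_dirty_idx() change is the same fix for sort(): the comparator now takes const void * and casts internally.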
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f0a5538c84b0..74aee92433d7 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -293,6 +293,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
}
/**
+ * __queue_and_wait - queue a task and wait until the task is woken up.
+ * @c: UBIFS file-system description object
+ *
+ * This function adds the current task to the queue and waits until it is
+ * woken up. This function should be called with @c->reserve_space_wq locked.
+ */
+static void __queue_and_wait(struct ubifs_info *c)
+{
+ DEFINE_WAIT(wait);
+
+ __add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&c->reserve_space_wq.lock);
+
+ schedule();
+ finish_wait(&c->reserve_space_wq, &wait);
+}
+
+/**
+ * wait_for_reservation - try queuing the current task to wait until woken up.
+ * @c: UBIFS file-system description object
+ *
+ * This function queues the current task to wait until it is woken up, if
+ * queuing has started (@c->need_wait_space is not %0). Returns %true if the
+ * current task was added to the queue, otherwise %false is returned.
+ */
+static bool wait_for_reservation(struct ubifs_info *c)
+{
+ if (likely(atomic_read(&c->need_wait_space) == 0))
+ /* Quick path to check whether queuing is started. */
+ return false;
+
+ spin_lock(&c->reserve_space_wq.lock);
+ if (atomic_read(&c->need_wait_space) == 0) {
+ /* Queuing is not started, don't queue current task. */
+ spin_unlock(&c->reserve_space_wq.lock);
+ return false;
+ }
+
+ __queue_and_wait(c);
+ return true;
+}
+
+/**
+ * wake_up_reservation - wake up first task in queue or stop queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function wakes up the first task in queue if it exists, or stops
+ * queuing if no tasks in queue.
+ */
+static void wake_up_reservation(struct ubifs_info *c)
+{
+ spin_lock(&c->reserve_space_wq.lock);
+ if (waitqueue_active(&c->reserve_space_wq))
+ wake_up_locked(&c->reserve_space_wq);
+ else
+ /*
+ * As in wait_for_reservation(), @c->need_wait_space is updated
+ * under the protection of the wait queue lock, which avoids
+ * clearing it to 0 after a new task has been queued.
+ */
+ atomic_set(&c->need_wait_space, 0);
+ spin_unlock(&c->reserve_space_wq.lock);
+}
+
+/**
+ * add_or_start_queue - add the current task to the queue or start queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function starts queuing if queuing has not started, otherwise it adds
+ * the current task to the queue.
+ */
+static void add_or_start_queue(struct ubifs_info *c)
+{
+ spin_lock(&c->reserve_space_wq.lock);
+ if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) {
+ /* Starts queuing, task can go on directly. */
+ spin_unlock(&c->reserve_space_wq.lock);
+ return;
+ }
+
+ /*
+ * At this point at least two tasks have retried more than 32
+ * times; the first task has started queuing, so just queue the
+ * remaining tasks.
+ */
+ __queue_and_wait(c);
+}
+
+/**
* make_reservation - reserve journal space.
* @c: UBIFS file-system description object
* @jhead: journal head
@@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
static int make_reservation(struct ubifs_info *c, int jhead, int len)
{
int err, cmt_retries = 0, nospc_retries = 0;
+ bool blocked = wait_for_reservation(c);
again:
down_read(&c->commit_sem);
err = reserve_space(c, jhead, len);
- if (!err)
+ if (!err) {
/* c->commit_sem will get released via finish_reservation(). */
- return 0;
+ goto out_wake_up;
+ }
up_read(&c->commit_sem);
if (err == -ENOSPC) {
/*
* GC could not make any progress. We should try to commit
- * once because it could make some dirty space and GC would
- * make progress, so make the error -EAGAIN so that the below
+ * because it could make some dirty space and GC would make
+ * progress, so make the error -EAGAIN so that the below
* will commit and re-try.
*/
- if (nospc_retries++ < 2) {
- dbg_jnl("no space, retry");
- err = -EAGAIN;
- }
-
- /*
- * This means that the budgeting is incorrect. We always have
- * to be able to write to the media, because all operations are
- * budgeted. Deletions are not budgeted, though, but we reserve
- * an extra LEB for them.
- */
+ nospc_retries++;
+ dbg_jnl("no space, retry");
+ err = -EAGAIN;
}
if (err != -EAGAIN)
@@ -349,15 +433,37 @@ again:
*/
if (cmt_retries > 128) {
/*
- * This should not happen unless the journal size limitations
- * are too tough.
+ * This should not happen unless:
+ * 1. The journal size limitations are too tough.
+ * 2. The budgeting is incorrect. We always have to be able to
+ * write to the media, because all operations are budgeted.
+ * Deletions are not budgeted, though, but we reserve an
+ * extra LEB for them.
*/
- ubifs_err(c, "stuck in space allocation");
+ ubifs_err(c, "stuck in space allocation, nospc_retries %d",
+ nospc_retries);
err = -ENOSPC;
goto out;
- } else if (cmt_retries > 32)
- ubifs_warn(c, "too many space allocation re-tries (%d)",
- cmt_retries);
+ } else if (cmt_retries > 32) {
+ /*
+ * This is almost impossible, unless many tasks are making
+ * reservations concurrently and some task has retried gc +
+ * commit many times, with the available space generated during
+ * this period grabbed by other tasks.
+ * But if it happens, start queuing up all tasks that will make
+ * a space reservation; then there is only one task making a
+ * space reservation at any time, and it can always succeed
+ * provided the budgeting is correct.
+ */
+ ubifs_warn(c, "too many space allocation cmt_retries (%d) "
+ "nospc_retries (%d), start queuing tasks",
+ cmt_retries, nospc_retries);
+
+ if (!blocked) {
+ blocked = true;
+ add_or_start_queue(c);
+ }
+ }
dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
cmt_retries);
@@ -365,7 +471,7 @@ again:
err = ubifs_run_commit(c);
if (err)
- return err;
+ goto out_wake_up;
goto again;
out:
@@ -380,6 +486,27 @@ out:
cmt_retries = dbg_check_lprops(c);
up_write(&c->commit_sem);
}
+out_wake_up:
+ if (blocked) {
+ /*
+ * Only tasks that have ever started queuing or ever been queued
+ * can wake up other queued tasks, which ensures that there is
+ * only one task woken up to make a space reservation.
+ * For example:
+ * task A task B task C
+ * make_reservation make_reservation
+ * reserve_space // 0
+ * wake_up_reservation
+ * atomic_cmpxchg // 0, start queuing
+ * reserve_space
+ * wait_for_reservation
+ * __queue_and_wait
+ * add_wait_queue
+ * if (blocked) // false
+ * // So that task C won't be woken up to race with task B
+ */
+ wake_up_reservation(c);
+ }
return err;
}
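The throttle works because waiters are added with __add_wait_queue_entry_tail_exclusive() and woken with wake_up_locked(), which wakes at most one exclusive waiter; once the queue forms, reservations drain strictly one at a time in FIFO order. A toy illustration of those exclusive-wakeup semantics (standalone sketch, not from the patch):

static DECLARE_WAIT_QUEUE_HEAD(wq);
static int resource_ready;	/* protected by wq.lock */

static void consumer(void)
{
	spin_lock(&wq.lock);
	while (!resource_ready) {
		DEFINE_WAIT(wait);

		__add_wait_queue_entry_tail_exclusive(&wq, &wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&wq.lock);
		schedule();
		finish_wait(&wq, &wait);
		spin_lock(&wq.lock);
	}
	resource_ready = 0;
	spin_unlock(&wq.lock);
}

static void producer(void)
{
	spin_lock(&wq.lock);
	resource_ready = 1;
	wake_up_locked(&wq);	/* wakes at most one exclusive waiter */
	spin_unlock(&wq.lock);
}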
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 6d6cd85c2b4c..a11c3dab7e16 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1014,8 +1014,9 @@ out:
*/
static int scan_check_cb(struct ubifs_info *c,
const struct ubifs_lprops *lp, int in_tree,
- struct ubifs_lp_stats *lst)
+ void *arg)
{
+ struct ubifs_lp_stats *lst = arg;
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
@@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c)
memset(&lst, 0, sizeof(struct ubifs_lp_stats));
err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
- (ubifs_lpt_scan_callback)scan_check_cb,
- &lst);
+ scan_check_cb, &lst);
if (err && err != -ENOSPC)
goto out;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index c4d079328b92..07351fdce722 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
len -= node_len;
}
- err = 0;
out:
vfree(buf);
return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7f4031a15f4d..291583005dd1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
mutex_init(&c->bu_mutex);
mutex_init(&c->write_reserve_mutex);
init_waitqueue_head(&c->cmt_wq);
+ init_waitqueue_head(&c->reserve_space_wq);
+ atomic_set(&c->need_wait_space, 0);
c->buds = RB_ROOT;
c->old_idx = RB_ROOT;
c->size_tree = RB_ROOT;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f4728e65d1bd..45cacdcd4746 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
void ubifs_tnc_close(struct ubifs_info *c)
{
tnc_destroy_cnext(c);
- if (c->zroot.znode) {
- long n, freed;
-
- n = atomic_long_read(&c->clean_zn_cnt);
- freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
- ubifs_assert(c, freed == n);
- atomic_long_sub(n, &ubifs_clean_zn_cnt);
- }
+ ubifs_destroy_tnc_tree(c);
kfree(c->gap_lebs);
kfree(c->ilebs);
destroy_old_idx(c);
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 4d686e34e64d..d3f8a6aa1f49 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -251,6 +251,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
}
/**
+ * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the whole TNC tree and updates clean global znode
+ * count.
+ */
+void ubifs_destroy_tnc_tree(struct ubifs_info *c)
+{
+ long n, freed;
+
+ if (!c->zroot.znode)
+ return;
+
+ n = atomic_long_read(&c->clean_zn_cnt);
+ freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
+ ubifs_assert(c, freed == n);
+ atomic_long_sub(n, &ubifs_clean_zn_cnt);
+
+ c->zroot.znode = NULL;
+}
+
+/**
* read_znode - read an indexing node from flash and fill znode.
* @c: UBIFS file-system description object
* @zzbr: the zbranch describing the node to read
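Both teardown paths now share the helper: ubifs_tnc_close() calls it unconditionally, and the dbg_check_idx_size() error path added earlier uses it to drop the znodes pulled in while walking a corrupted index. Because the helper also clears c->zroot.znode, calling it twice is safe:

	/* fs/ubifs/debug.c, on index-walk failure: */
	ubifs_destroy_tnc_tree(c);
	return err;

	/* fs/ubifs/tnc.c, on unmount: */
	tnc_destroy_cnext(c);
	ubifs_destroy_tnc_tree(c);	/* no-op if already destroyed */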
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 3916dc4f30ca..1f3ea879d93a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1047,6 +1047,8 @@ struct ubifs_debug_info;
* @bg_bud_bytes: number of bud bytes when background commit is initiated
* @old_buds: buds to be released after commit ends
* @max_bud_cnt: maximum number of buds
+ * @need_wait_space: Non-%0 means space reservation tasks need to wait in queue
+ * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0
*
* @commit_sem: synchronizes committer with other processes
* @cmt_state: commit state
@@ -1305,6 +1307,8 @@ struct ubifs_info {
long long bg_bud_bytes;
struct list_head old_buds;
int max_bud_cnt;
+ atomic_t need_wait_space;
+ wait_queue_head_t reserve_space_wq;
struct rw_semaphore commit_sem;
int cmt_state;
@@ -1903,6 +1907,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c,
struct ubifs_znode *znode);
long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
struct ubifs_znode *zr);
+void ubifs_destroy_tnc_tree(struct ubifs_info *c);
struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
struct ubifs_zbranch *zbr,
struct ubifs_znode *parent, int iip);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f6533f93851b..f94f45fe2c91 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -67,7 +67,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
pos_valid = true;
}
- fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ fname = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
if (!fname) {
ret = -ENOMEM;
goto out;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index d8493449d4c5..2f831a3a91af 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -357,7 +357,7 @@ int udf_expand_file_adinicb(struct inode *inode)
return 0;
}
- page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, 0, GFP_KERNEL);
if (!page)
return -ENOMEM;
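Relaxing these allocations from GFP_NOFS to GFP_KERNEL is presumably safe because the call sites do not hold locks that direct reclaim could recurse on; the hunks themselves do not state the rationale. Where NOFS behaviour genuinely is needed, the preferred modern form is the scoped annotation rather than a per-call flag, roughly:

	unsigned int flags = memalloc_nofs_save();

	/* every allocation in this scope behaves as GFP_NOFS, including
	 * allocations made by library code called from here */
	ptr = kmalloc(len, GFP_KERNEL);

	memalloc_nofs_restore(flags);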
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 1bb6ed948927..1308109fd42d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -59,7 +59,7 @@ static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child,
child->name[0] == '.' && child->name[1] == '.';
int ret;
- fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ fname = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
if (!fname)
return -ENOMEM;
@@ -566,7 +566,7 @@ out:
static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
+ struct inode *inode;
struct pathComponent *pc;
const char *compstart;
struct extent_position epos = {};
@@ -579,17 +579,20 @@ static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct udf_inode_info *iinfo;
struct super_block *sb = dir->i_sb;
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- iinfo = UDF_I(inode);
- down_write(&iinfo->i_data_sem);
- name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
+ name = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
- goto out_no_entry;
+ goto out;
+ }
+
+ inode = udf_new_inode(dir, S_IFLNK | 0777);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out;
}
+ iinfo = UDF_I(inode);
+ down_write(&iinfo->i_data_sem);
inode->i_data.a_ops = &udf_symlink_aops;
inode->i_op = &udf_symlink_inode_operations;
inode_nohighmem(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6f420f4ca005..2217f7ed7a49 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -40,20 +40,20 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/parser.h>
#include <linux/stat.h>
#include <linux/cdrom.h>
#include <linux/nls.h>
#include <linux/vfs.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>
#include <linux/crc-itu-t.h>
#include <linux/log2.h>
#include <asm/byteorder.h>
#include <linux/iversion.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "udf_sb.h"
#include "udf_i.h"
@@ -91,16 +91,20 @@ enum { UDF_MAX_LINKS = 0xffff };
#define UDF_MAX_FILESIZE (1ULL << 42)
/* These are the "meat" - everything else is stuffing */
-static int udf_fill_super(struct super_block *, void *, int);
+static int udf_fill_super(struct super_block *sb, struct fs_context *fc);
static void udf_put_super(struct super_block *);
static int udf_sync_fs(struct super_block *, int);
-static int udf_remount_fs(struct super_block *, int *, char *);
static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
static void udf_open_lvid(struct super_block *);
static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
static int udf_show_options(struct seq_file *, struct dentry *);
+static int udf_init_fs_context(struct fs_context *fc);
+static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param);
+static int udf_reconfigure(struct fs_context *fc);
+static void udf_free_fc(struct fs_context *fc);
+static const struct fs_parameter_spec udf_param_spec[];
struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
{
@@ -119,18 +123,25 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
}
/* UDF filesystem type */
-static struct dentry *udf_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int udf_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+ return get_tree_bdev(fc, udf_fill_super);
}
+static const struct fs_context_operations udf_context_ops = {
+ .parse_param = udf_parse_param,
+ .get_tree = udf_get_tree,
+ .reconfigure = udf_reconfigure,
+ .free = udf_free_fc,
+};
+
static struct file_system_type udf_fstype = {
.owner = THIS_MODULE,
.name = "udf",
- .mount = udf_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = udf_init_fs_context,
+ .parameters = udf_param_spec,
};
MODULE_ALIAS_FS("udf");
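Under the new mount API the option handling splits into a fixed lifecycle: init_fs_context() allocates the private udf_options, parse_param() runs once per option, get_tree() or reconfigure() applies the result, and free() releases it. A compressed view of the wiring as set up in the hunks below (all names from the diff):

	/* udf_init_fs_context(): */
	fc->fs_private = uopt;		/* per-mount udf_options */
	fc->ops = &udf_context_ops;	/* parse_param / get_tree /
					   reconfigure / free */

	/* mount:    udf_parse_param() per option, then
	 *           udf_get_tree() -> get_tree_bdev(fc, udf_fill_super)
	 * remount:  udf_parse_param() per option, then udf_reconfigure()
	 * teardown: udf_free_fc() -> unload_nls() + kfree()
	 */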
@@ -203,12 +214,10 @@ static const struct super_operations udf_sb_ops = {
.put_super = udf_put_super,
.sync_fs = udf_sync_fs,
.statfs = udf_statfs,
- .remount_fs = udf_remount_fs,
.show_options = udf_show_options,
};
struct udf_options {
- unsigned char novrs;
unsigned int blocksize;
unsigned int session;
unsigned int lastblock;
@@ -222,6 +231,65 @@ struct udf_options {
struct nls_table *nls_map;
};
+/*
+ * UDF has historically preserved prior mount options across
+ * a remount, so copy those here if remounting, otherwise set
+ * initial mount defaults.
+ */
+static void udf_init_options(struct fs_context *fc, struct udf_options *uopt)
+{
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct udf_sb_info *sbi = UDF_SB(sb);
+
+ uopt->flags = sbi->s_flags;
+ uopt->uid = sbi->s_uid;
+ uopt->gid = sbi->s_gid;
+ uopt->umask = sbi->s_umask;
+ uopt->fmode = sbi->s_fmode;
+ uopt->dmode = sbi->s_dmode;
+ uopt->nls_map = NULL;
+ } else {
+ uopt->flags = (1 << UDF_FLAG_USE_AD_IN_ICB) |
+ (1 << UDF_FLAG_STRICT);
+ /*
+ * By default we'll use overflow[ug]id when UDF
+ * inode [ug]id == -1
+ */
+ uopt->uid = make_kuid(current_user_ns(), overflowuid);
+ uopt->gid = make_kgid(current_user_ns(), overflowgid);
+ uopt->umask = 0;
+ uopt->fmode = UDF_INVALID_MODE;
+ uopt->dmode = UDF_INVALID_MODE;
+ uopt->nls_map = NULL;
+ uopt->session = 0xFFFFFFFF;
+ }
+}
+
+static int udf_init_fs_context(struct fs_context *fc)
+{
+ struct udf_options *uopt;
+
+ uopt = kzalloc(sizeof(*uopt), GFP_KERNEL);
+ if (!uopt)
+ return -ENOMEM;
+
+ udf_init_options(fc, uopt);
+
+ fc->fs_private = uopt;
+ fc->ops = &udf_context_ops;
+
+ return 0;
+}
+
+static void udf_free_fc(struct fs_context *fc)
+{
+ struct udf_options *uopt = fc->fs_private;
+
+ unload_nls(uopt->nls_map);
+ kfree(fc->fs_private);
+}
+
static int __init init_udf_fs(void)
{
int err;
@@ -357,7 +425,7 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
}
/*
- * udf_parse_options
+ * udf_parse_param
*
* PURPOSE
* Parse mount options.
@@ -400,12 +468,12 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
* yield highly unpredictable results.
*
* PRE-CONDITIONS
- * options Pointer to mount options string.
- * uopts Pointer to mount options variable.
+ * fc fs_context with pointer to mount options variable.
+ * param Pointer to fs_parameter being parsed.
*
* POST-CONDITIONS
- * <return> 1 Mount options parsed okay.
- * <return> 0 Error parsing mount options.
+ * <return> 0 Mount options parsed okay.
+ * <return> errno Error parsing mount options.
*
* HISTORY
* July 1, 1997 - Andrew E. Mileski
@@ -417,229 +485,193 @@ enum {
Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad,
Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
- Opt_rootdir, Opt_utf8, Opt_iocharset,
- Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
- Opt_fmode, Opt_dmode
-};
-
-static const match_table_t tokens = {
- {Opt_novrs, "novrs"},
- {Opt_nostrict, "nostrict"},
- {Opt_bs, "bs=%u"},
- {Opt_unhide, "unhide"},
- {Opt_undelete, "undelete"},
- {Opt_noadinicb, "noadinicb"},
- {Opt_adinicb, "adinicb"},
- {Opt_shortad, "shortad"},
- {Opt_longad, "longad"},
- {Opt_uforget, "uid=forget"},
- {Opt_uignore, "uid=ignore"},
- {Opt_gforget, "gid=forget"},
- {Opt_gignore, "gid=ignore"},
- {Opt_gid, "gid=%u"},
- {Opt_uid, "uid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_session, "session=%u"},
- {Opt_lastblock, "lastblock=%u"},
- {Opt_anchor, "anchor=%u"},
- {Opt_volume, "volume=%u"},
- {Opt_partition, "partition=%u"},
- {Opt_fileset, "fileset=%u"},
- {Opt_rootdir, "rootdir=%u"},
- {Opt_utf8, "utf8"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_fmode, "mode=%o"},
- {Opt_dmode, "dmode=%o"},
- {Opt_err, NULL}
+ Opt_rootdir, Opt_utf8, Opt_iocharset, Opt_err, Opt_fmode, Opt_dmode
};
-static int udf_parse_options(char *options, struct udf_options *uopt,
- bool remount)
+static const struct fs_parameter_spec udf_param_spec[] = {
+ fsparam_flag ("novrs", Opt_novrs),
+ fsparam_flag ("nostrict", Opt_nostrict),
+ fsparam_u32 ("bs", Opt_bs),
+ fsparam_flag ("unhide", Opt_unhide),
+ fsparam_flag ("undelete", Opt_undelete),
+ fsparam_flag_no ("adinicb", Opt_adinicb),
+ fsparam_flag ("shortad", Opt_shortad),
+ fsparam_flag ("longad", Opt_longad),
+ fsparam_string ("gid", Opt_gid),
+ fsparam_string ("uid", Opt_uid),
+ fsparam_u32 ("umask", Opt_umask),
+ fsparam_u32 ("session", Opt_session),
+ fsparam_u32 ("lastblock", Opt_lastblock),
+ fsparam_u32 ("anchor", Opt_anchor),
+ fsparam_u32 ("volume", Opt_volume),
+ fsparam_u32 ("partition", Opt_partition),
+ fsparam_u32 ("fileset", Opt_fileset),
+ fsparam_u32 ("rootdir", Opt_rootdir),
+ fsparam_flag ("utf8", Opt_utf8),
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_u32 ("mode", Opt_fmode),
+ fsparam_u32 ("dmode", Opt_dmode),
+ {}
+ };
+
+static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- int option;
unsigned int uv;
-
- uopt->novrs = 0;
- uopt->session = 0xFFFFFFFF;
- uopt->lastblock = 0;
- uopt->anchor = 0;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- unsigned n;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_novrs:
- uopt->novrs = 1;
- break;
- case Opt_bs:
- if (match_int(&args[0], &option))
- return 0;
- n = option;
- if (n != 512 && n != 1024 && n != 2048 && n != 4096)
- return 0;
- uopt->blocksize = n;
- uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
- break;
- case Opt_unhide:
- uopt->flags |= (1 << UDF_FLAG_UNHIDE);
- break;
- case Opt_undelete:
- uopt->flags |= (1 << UDF_FLAG_UNDELETE);
- break;
- case Opt_noadinicb:
+ unsigned int n;
+ struct udf_options *uopt = fc->fs_private;
+ struct fs_parse_result result;
+ int token;
+ bool remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+
+ token = fs_parse(fc, udf_param_spec, param, &result);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_novrs:
+ uopt->flags |= (1 << UDF_FLAG_NOVRS);
+ break;
+ case Opt_bs:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+ return -EINVAL;
+ uopt->blocksize = n;
+ uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
+ break;
+ case Opt_unhide:
+ uopt->flags |= (1 << UDF_FLAG_UNHIDE);
+ break;
+ case Opt_undelete:
+ uopt->flags |= (1 << UDF_FLAG_UNDELETE);
+ break;
+ case Opt_adinicb:
+ if (result.negated)
uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB);
- break;
- case Opt_adinicb:
+ else
uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB);
- break;
- case Opt_shortad:
- uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
- break;
- case Opt_longad:
- uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
- break;
- case Opt_gid:
- if (match_uint(args, &uv))
- return 0;
- uopt->gid = make_kgid(current_user_ns(), uv);
- if (!gid_valid(uopt->gid))
- return 0;
+ break;
+ case Opt_shortad:
+ uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
+ break;
+ case Opt_longad:
+ uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
+ break;
+ case Opt_gid:
+ if (kstrtouint(param->string, 10, &uv) == 0) {
+ kgid_t gid = make_kgid(current_user_ns(), uv);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ uopt->gid = gid;
uopt->flags |= (1 << UDF_FLAG_GID_SET);
- break;
- case Opt_uid:
- if (match_uint(args, &uv))
- return 0;
- uopt->uid = make_kuid(current_user_ns(), uv);
- if (!uid_valid(uopt->uid))
- return 0;
+ } else if (!strcmp(param->string, "forget")) {
+ uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
+ } else if (!strcmp(param->string, "ignore")) {
+ /* this option is superseded by gid=<number> */
+ ;
+ } else {
+ return -EINVAL;
+ }
+ break;
+ case Opt_uid:
+ if (kstrtouint(param->string, 10, &uv) == 0) {
+ kuid_t uid = make_kuid(current_user_ns(), uv);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ uopt->uid = uid;
uopt->flags |= (1 << UDF_FLAG_UID_SET);
- break;
- case Opt_umask:
- if (match_octal(args, &option))
- return 0;
- uopt->umask = option;
- break;
- case Opt_nostrict:
- uopt->flags &= ~(1 << UDF_FLAG_STRICT);
- break;
- case Opt_session:
- if (match_int(args, &option))
- return 0;
- uopt->session = option;
- if (!remount)
- uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
- break;
- case Opt_lastblock:
- if (match_int(args, &option))
- return 0;
- uopt->lastblock = option;
- if (!remount)
- uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
- break;
- case Opt_anchor:
- if (match_int(args, &option))
- return 0;
- uopt->anchor = option;
- break;
- case Opt_volume:
- case Opt_partition:
- case Opt_fileset:
- case Opt_rootdir:
- /* Ignored (never implemented properly) */
- break;
- case Opt_utf8:
- if (!remount) {
- unload_nls(uopt->nls_map);
- uopt->nls_map = NULL;
- }
- break;
- case Opt_iocharset:
- if (!remount) {
- unload_nls(uopt->nls_map);
- uopt->nls_map = NULL;
- }
- /* When nls_map is not loaded then UTF-8 is used */
- if (!remount && strcmp(args[0].from, "utf8") != 0) {
- uopt->nls_map = load_nls(args[0].from);
- if (!uopt->nls_map) {
- pr_err("iocharset %s not found\n",
- args[0].from);
- return 0;
- }
- }
- break;
- case Opt_uforget:
+ } else if (!strcmp(param->string, "forget")) {
uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
- break;
- case Opt_uignore:
- case Opt_gignore:
- /* These options are superseeded by uid=<number> */
- break;
- case Opt_gforget:
- uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
- break;
- case Opt_fmode:
- if (match_octal(args, &option))
- return 0;
- uopt->fmode = option & 0777;
- break;
- case Opt_dmode:
- if (match_octal(args, &option))
- return 0;
- uopt->dmode = option & 0777;
- break;
- default:
- pr_err("bad mount option \"%s\" or missing value\n", p);
- return 0;
+ } else if (!strcmp(param->string, "ignore")) {
+ /* this option is superseded by uid=<number> */
+ ;
+ } else {
+ return -EINVAL;
+ }
+ break;
+ case Opt_umask:
+ uopt->umask = result.uint_32;
+ break;
+ case Opt_nostrict:
+ uopt->flags &= ~(1 << UDF_FLAG_STRICT);
+ break;
+ case Opt_session:
+ uopt->session = result.uint_32;
+ if (!remount)
+ uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
+ break;
+ case Opt_lastblock:
+ uopt->lastblock = result.uint_32;
+ if (!remount)
+ uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
+ break;
+ case Opt_anchor:
+ uopt->anchor = result.uint_32;
+ break;
+ case Opt_volume:
+ case Opt_partition:
+ case Opt_fileset:
+ case Opt_rootdir:
+ /* Ignored (never implemented properly) */
+ break;
+ case Opt_utf8:
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
}
+ break;
+ case Opt_iocharset:
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
+ /* When nls_map is not loaded then UTF-8 is used */
+ if (!remount && strcmp(param->string, "utf8") != 0) {
+ uopt->nls_map = load_nls(param->string);
+ if (!uopt->nls_map) {
+ errorf(fc, "iocharset %s not found",
+ param->string);
+ return -EINVAL;
+ }
+ }
+ break;
+ case Opt_fmode:
+ uopt->fmode = result.uint_32 & 0777;
+ break;
+ case Opt_dmode:
+ uopt->dmode = result.uint_32 & 0777;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
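For context, a minimal sketch of how a parse_param handler like the one above
is typically wired up under the new mount API. The helper names below
(udf_get_tree, udf_free_fc, udf_init_fs_context) are illustrative assumptions,
not taken from this hunk; the defaults mirror the initialization that the old
udf_fill_super()/udf_parse_options() paths did by hand:

static int udf_get_tree(struct fs_context *fc)
{
	/* block-device filesystem: locate the bdev, then fill the sb */
	return get_tree_bdev(fc, udf_fill_super);
}

static void udf_free_fc(struct fs_context *fc)
{
	struct udf_options *uopt = fc->fs_private;

	if (uopt) {
		unload_nls(uopt->nls_map);
		kfree(uopt);
	}
}

static const struct fs_context_operations udf_context_ops = {
	.parse_param	= udf_parse_param,
	.get_tree	= udf_get_tree,
	.reconfigure	= udf_reconfigure,
	.free		= udf_free_fc,
};

static int udf_init_fs_context(struct fs_context *fc)
{
	struct udf_options *uopt;

	uopt = kzalloc(sizeof(*uopt), GFP_KERNEL);
	if (!uopt)
		return -ENOMEM;

	/* defaults the old fill_super/parse_options paths set up inline */
	uopt->flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
	uopt->uid = make_kuid(current_user_ns(), overflowuid);
	uopt->gid = make_kgid(current_user_ns(), overflowgid);
	uopt->umask = 0;
	uopt->fmode = UDF_INVALID_MODE;
	uopt->dmode = UDF_INVALID_MODE;
	uopt->session = 0xFFFFFFFF;

	fc->fs_private = uopt;
	fc->ops = &udf_context_ops;
	return 0;
}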
-static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
+static int udf_reconfigure(struct fs_context *fc)
{
- struct udf_options uopt;
+ struct udf_options *uopt = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
+ int readonly = fc->sb_flags & SB_RDONLY;
int error = 0;
- if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
+ if (!readonly && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
return -EACCES;
sync_filesystem(sb);
- uopt.flags = sbi->s_flags;
- uopt.uid = sbi->s_uid;
- uopt.gid = sbi->s_gid;
- uopt.umask = sbi->s_umask;
- uopt.fmode = sbi->s_fmode;
- uopt.dmode = sbi->s_dmode;
- uopt.nls_map = NULL;
-
- if (!udf_parse_options(options, &uopt, true))
- return -EINVAL;
-
write_lock(&sbi->s_cred_lock);
- sbi->s_flags = uopt.flags;
- sbi->s_uid = uopt.uid;
- sbi->s_gid = uopt.gid;
- sbi->s_umask = uopt.umask;
- sbi->s_fmode = uopt.fmode;
- sbi->s_dmode = uopt.dmode;
+ sbi->s_flags = uopt->flags;
+ sbi->s_uid = uopt->uid;
+ sbi->s_gid = uopt->gid;
+ sbi->s_umask = uopt->umask;
+ sbi->s_fmode = uopt->fmode;
+ sbi->s_dmode = uopt->dmode;
write_unlock(&sbi->s_cred_lock);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if (readonly == sb_rdonly(sb))
goto out_unlock;
- if (*flags & SB_RDONLY)
+ if (readonly)
udf_close_lvid(sb);
else
udf_open_lvid(sb);
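Note that under fs_context the option string has already been fed through
udf_parse_param() parameter by parameter before ->reconfigure runs, which is
why the explicit udf_parse_options(..., remount) call and the on-stack
udf_options copy disappear here: fc->fs_private already carries the parsed
result, presumably seeded from the current superblock settings during context
setup, mirroring the uopt initialization the old remount path did by hand.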
@@ -863,7 +895,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
int ret;
struct timestamp *ts;
- outstr = kmalloc(128, GFP_NOFS);
+ outstr = kmalloc(128, GFP_KERNEL);
if (!outstr)
return -ENOMEM;
@@ -1538,6 +1570,20 @@ out_bh:
return ret;
}
+static bool udf_lvid_valid(struct super_block *sb,
+ struct logicalVolIntegrityDesc *lvid)
+{
+ u32 parts, impuselen;
+
+ parts = le32_to_cpu(lvid->numOfPartitions);
+ impuselen = le32_to_cpu(lvid->lengthOfImpUse);
+ if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
+ sizeof(struct logicalVolIntegrityDesc) + impuselen +
+ 2 * parts * sizeof(u32) > sb->s_blocksize)
+ return false;
+ return true;
+}
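+/*
+ * Illustration (not part of the patch): the two individual bounds checks
+ * are what make the summed check overflow-safe. "2 * parts" is evaluated
+ * in 32-bit arithmetic, so e.g. parts = 0x80000000 would wrap to 0 and
+ * the combined size test alone would pass; rejecting
+ * parts >= sb->s_blocksize first closes that hole.
+ */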
+
/*
* Find the prevailing Logical Volume Integrity Descriptor.
*/
@@ -1548,7 +1594,6 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
int indirections = 0;
- u32 parts, impuselen;
while (++indirections <= UDF_MAX_LVID_NESTING) {
final_bh = NULL;
@@ -1570,32 +1615,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
if (!final_bh)
return;
- brelse(sbi->s_lvid_bh);
- sbi->s_lvid_bh = final_bh;
-
lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
+ if (udf_lvid_valid(sb, lvid)) {
+ brelse(sbi->s_lvid_bh);
+ sbi->s_lvid_bh = final_bh;
+ } else {
+ udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
+ "ignoring.\n",
+ le32_to_cpu(lvid->numOfPartitions),
+ le32_to_cpu(lvid->lengthOfImpUse));
+ }
+
if (lvid->nextIntegrityExt.extLength == 0)
- goto check;
+ return;
loc = leea_to_cpu(lvid->nextIntegrityExt);
}
udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
UDF_MAX_LVID_NESTING);
-out_err:
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
- return;
-check:
- parts = le32_to_cpu(lvid->numOfPartitions);
- impuselen = le32_to_cpu(lvid->lengthOfImpUse);
- if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
- sizeof(struct logicalVolIntegrityDesc) + impuselen +
- 2 * parts * sizeof(u32) > sb->s_blocksize) {
- udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
- "ignoring.\n", parts, impuselen);
- goto out_err;
- }
}
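With this restructuring, udf_lvid_valid() is checked before the buffer is
installed in sbi->s_lvid_bh, so a corrupted LVID is warned about and skipped
rather than briefly exposed to the rest of the mount path; the walk still
follows nextIntegrityExt, so a later, valid descriptor in the chain can
prevail.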
/*
@@ -1945,7 +1985,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return -EINVAL;
}
sbi->s_last_block = uopt->lastblock;
- if (!uopt->novrs) {
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_NOVRS)) {
/* Check that it is NSR02 compliant */
nsr = udf_check_vsd(sb);
if (!nsr) {
@@ -2083,23 +2123,15 @@ u64 lvid_get_unique_id(struct super_block *sb)
return ret;
}
-static int udf_fill_super(struct super_block *sb, void *options, int silent)
+static int udf_fill_super(struct super_block *sb, struct fs_context *fc)
{
int ret = -EINVAL;
struct inode *inode = NULL;
- struct udf_options uopt;
+ struct udf_options *uopt = fc->fs_private;
struct kernel_lb_addr rootdir, fileset;
struct udf_sb_info *sbi;
bool lvid_open = false;
-
- uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
- /* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */
- uopt.uid = make_kuid(current_user_ns(), overflowuid);
- uopt.gid = make_kgid(current_user_ns(), overflowgid);
- uopt.umask = 0;
- uopt.fmode = UDF_INVALID_MODE;
- uopt.dmode = UDF_INVALID_MODE;
- uopt.nls_map = NULL;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -2109,25 +2141,23 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
mutex_init(&sbi->s_alloc_mutex);
- if (!udf_parse_options((char *)options, &uopt, false))
- goto parse_options_failure;
-
fileset.logicalBlockNum = 0xFFFFFFFF;
fileset.partitionReferenceNum = 0xFFFF;
- sbi->s_flags = uopt.flags;
- sbi->s_uid = uopt.uid;
- sbi->s_gid = uopt.gid;
- sbi->s_umask = uopt.umask;
- sbi->s_fmode = uopt.fmode;
- sbi->s_dmode = uopt.dmode;
- sbi->s_nls_map = uopt.nls_map;
+ sbi->s_flags = uopt->flags;
+ sbi->s_uid = uopt->uid;
+ sbi->s_gid = uopt->gid;
+ sbi->s_umask = uopt->umask;
+ sbi->s_fmode = uopt->fmode;
+ sbi->s_dmode = uopt->dmode;
+ sbi->s_nls_map = uopt->nls_map;
+ uopt->nls_map = NULL;
rwlock_init(&sbi->s_cred_lock);
- if (uopt.session == 0xFFFFFFFF)
+ if (uopt->session == 0xFFFFFFFF)
sbi->s_session = udf_get_last_session(sb);
else
- sbi->s_session = uopt.session;
+ sbi->s_session = uopt->session;
udf_debug("Multi-session=%d\n", sbi->s_session);
@@ -2138,16 +2168,16 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sb->s_magic = UDF_SUPER_MAGIC;
sb->s_time_gran = 1000;
- if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
- ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ if (uopt->flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+ ret = udf_load_vrs(sb, uopt, silent, &fileset);
} else {
- uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
- while (uopt.blocksize <= 4096) {
- ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ uopt->blocksize = bdev_logical_block_size(sb->s_bdev);
+ while (uopt->blocksize <= 4096) {
+ ret = udf_load_vrs(sb, uopt, silent, &fileset);
if (ret < 0) {
if (!silent && ret != -EACCES) {
pr_notice("Scanning with blocksize %u failed\n",
- uopt.blocksize);
+ uopt->blocksize);
}
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
@@ -2160,7 +2190,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
} else
break;
- uopt.blocksize <<= 1;
+ uopt->blocksize <<= 1;
}
}
if (ret < 0) {
@@ -2265,8 +2295,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
iput(sbi->s_vat_inode);
-parse_options_failure:
- unload_nls(uopt.nls_map);
+ unload_nls(uopt->nls_map);
if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index f9a60bc1abcf..08ec8756b948 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -23,6 +23,7 @@
#define UDF_FLAG_STRICT 5
#define UDF_FLAG_UNDELETE 6
#define UDF_FLAG_UNHIDE 7
+#define UDF_FLAG_NOVRS 8
#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
#define UDF_FLAG_GID_FORGET 12
#define UDF_FLAG_UID_SET 13
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 959551ff9a95..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- * fd_wqh.lock
- * fault_pending_wqh.lock
- * fault_wqh.lock
- * event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
- /* waitqueue head for the pending (i.e. not read) userfaults */
- wait_queue_head_t fault_pending_wqh;
- /* waitqueue head for the userfaults */
- wait_queue_head_t fault_wqh;
- /* waitqueue head for the pseudo fd to wakeup poll/read */
- wait_queue_head_t fd_wqh;
- /* waitqueue head for events */
- wait_queue_head_t event_wqh;
- /* a refile sequence protected by fault_pending_wqh lock */
- seqcount_spinlock_t refile_seq;
- /* pseudo fd refcounting */
- refcount_t refcount;
- /* userfaultfd syscall flags */
- unsigned int flags;
- /* features requested from the userspace */
- unsigned int features;
- /* released */
- bool released;
- /* memory mappings are changing because of non-cooperative event */
- atomic_t mmap_changing;
- /* mm with one ore more vmas attached to this userfaultfd_ctx */
- struct mm_struct *mm;
-};
-
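The context struct is not going away: it moves out of fs/userfaultfd.c
(presumably into a shared header such as userfaultfd_k.h) so the mm/ side can
take the new map_changing_lock rwsem — note the init_rwsem() calls below on a
field this hunk never declares.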
struct userfaultfd_fork_ctx {
struct userfaultfd_ctx *orig;
struct userfaultfd_ctx *new;
@@ -724,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags;
ctx->features = octx->features;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
+ down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing);
+ up_write(&octx->map_changing_lock);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
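The down_write()/up_write() pairs around atomic_inc(&ctx->mmap_changing) in
this and the following hunks form the writer side of the new rwsem. A sketch
of the assumed reader side (the mm/userfaultfd.c half of the change, not shown
in these hunks): each mfill/wp/move operation takes the lock shared and bails
out while a non-cooperative event is in flight:

	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;
	/* ... perform the uffd operation ... */
out_unlock:
	up_read(&ctx->map_changing_lock);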
@@ -776,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
vma_start_write(vma);
@@ -822,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -864,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1748,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- flags);
+ ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1800,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1857,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(ctx->mm)) {
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1909,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing, flags);
+ ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+ uffdio_continue.range.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1964,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
- uffdio_poison.range.len,
- &ctx->mmap_changing, 0);
+ ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+ uffdio_poison.range.len, 0);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -2040,16 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(mm)) {
- mmap_read_lock(mm);
-
- /* Re-check after taking mmap_lock */
- if (likely(!atomic_read(&ctx->mmap_changing)))
- ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
- uffdio_move.len, uffdio_move.mode);
- else
- ret = -EINVAL;
-
- mmap_read_unlock(mm);
+ ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
mmput(mm);
} else {
return -ESRCH;
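With the rwsem in place, the open-coded mmap_read_lock() and mmap_changing
re-check can move out of the ioctl handler; move_pages() is now expected to
take map_changing_lock for read and perform the in-flight-event check itself,
consistent with the other mfill_atomic_* conversions above.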
@@ -2255,6 +2212,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags;
ctx->features = 0;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 567fb37274d3..d41edd30388b 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -124,12 +124,24 @@ config XFS_DRAIN_INTENTS
bool
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+config XFS_LIVE_HOOKS
+ bool
+ select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
+config XFS_MEMORY_BUFS
+ bool
+
+config XFS_BTREE_IN_MEM
+ bool
+
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
depends on TMPFS && SHMEM
+ select XFS_LIVE_HOOKS
select XFS_DRAIN_INTENTS
+ select XFS_MEMORY_BUFS
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce
@@ -164,6 +176,7 @@ config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
depends on XFS_FS && XFS_ONLINE_SCRUB
+ select XFS_BTREE_IN_MEM
help
If you say Y here you will be able to repair metadata on a
mounted XFS filesystem. This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fbe3cdc79036..76674ad5833e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -92,8 +92,7 @@ xfs-y += xfs_aops.o \
xfs_symlink.o \
xfs_sysfs.o \
xfs_trans.o \
- xfs_xattr.o \
- kmem.o
+ xfs_xattr.o
# low-level transaction/log code
xfs-y += xfs_log.o \
@@ -137,6 +136,9 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
endif
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
+xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o
+xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o
+xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
@@ -159,6 +161,8 @@ xfs-y += $(addprefix scrub/, \
health.o \
ialloc.o \
inode.o \
+ iscan.o \
+ nlinks.o \
parent.o \
readdir.o \
refcount.o \
@@ -179,6 +183,7 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
dqiterate.o \
quota.o \
+ quotacheck.o \
)
# online repair
@@ -188,12 +193,17 @@ xfs-y += $(addprefix scrub/, \
alloc_repair.o \
bmap_repair.o \
cow_repair.o \
+ fscounters_repair.o \
ialloc_repair.o \
inode_repair.o \
newbt.o \
+ nlinks_repair.o \
+ rcbag_btree.o \
+ rcbag.o \
reap.o \
refcount_repair.o \
repair.o \
+ rmap_repair.o \
)
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
@@ -202,6 +212,7 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
quota_repair.o \
+ quotacheck_repair.o \
)
endif
endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
deleted file mode 100644
index c557a030acfe..000000000000
--- a/fs/xfs/kmem.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_message.h"
-#include "xfs_trace.h"
-
-void *
-kmem_alloc(size_t size, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_alloc(size, flags, _RET_IP_);
-
- do {
- ptr = kmalloc(size, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
- current->comm, current->pid,
- (unsigned int)size, __func__, lflags);
- memalloc_retry_wait(lflags);
- } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
deleted file mode 100644
index b987dc2c6851..000000000000
--- a/fs/xfs/kmem.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-
-/*
- * General memory allocation interfaces
- */
-
-typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
-#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
-#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
-#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u)
-
-/*
- * We use a special process flag to avoid recursive callbacks into
- * the filesystem during transactions. We will also issue our own
- * warnings, so we explicitly skip any generic ones (silly of us).
- */
-static inline gfp_t
-kmem_flags_convert(xfs_km_flags_t flags)
-{
- gfp_t lflags;
-
- BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
-
- lflags = GFP_KERNEL | __GFP_NOWARN;
- if (flags & KM_NOFS)
- lflags &= ~__GFP_FS;
-
- /*
- * Default page/slab allocator behavior is to retry for ever
- * for small allocations. We can override this behavior by using
- * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long
- * as it is feasible but rather fail than retry forever for all
- * request sizes.
- */
- if (flags & KM_MAYFAIL)
- lflags |= __GFP_RETRY_MAYFAIL;
-
- if (flags & KM_ZERO)
- lflags |= __GFP_ZERO;
-
- if (flags & KM_NOLOCKDEP)
- lflags |= __GFP_NOLOCKDEP;
-
- return lflags;
-}
-
-extern void *kmem_alloc(size_t, xfs_km_flags_t);
-static inline void kmem_free(const void *ptr)
-{
- kvfree(ptr);
-}
-
-
-static inline void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc(size, flags | KM_ZERO);
-}
-
-/*
- * Zone interfaces
- */
-static inline struct page *
-kmem_to_page(void *addr)
-{
- if (is_vmalloc_addr(addr))
- return vmalloc_to_page(addr);
- return virt_to_page(addr);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
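With kmem.c and kmem.h removed, the remaining call sites switch to the
standard allocator API with explicit GFP flags — see xfs_ag.c below, where
kmem_free() becomes kfree() and kmem_zalloc(..., KM_MAYFAIL) becomes
kzalloc(..., GFP_KERNEL | __GFP_RETRY_MAYFAIL); KM_NOFS contexts are
presumably covered by scoped memalloc_nofs_save() sections elsewhere in the
series.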
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 39d9525270b7..dc1873f76bff 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -217,6 +217,7 @@ xfs_initialize_perag_data(
*/
if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
error = -EFSCORRUPTED;
goto out;
}
@@ -241,7 +242,7 @@ __xfs_free_perag(
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- kmem_free(pag);
+ kfree(pag);
}
/*
@@ -263,7 +264,7 @@ xfs_free_perag(
xfs_defer_drain_free(&pag->pag_intents_drain);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_hash_destroy(pag);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
/* drop the mount's active reference */
xfs_perag_rele(pag);
@@ -351,9 +352,9 @@ xfs_free_unused_perag_range(
spin_unlock(&mp->m_perag_lock);
if (!pag)
break;
- xfs_buf_hash_destroy(pag);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
xfs_defer_drain_free(&pag->pag_intents_drain);
- kmem_free(pag);
+ kfree(pag);
}
}
@@ -381,7 +382,7 @@ xfs_initialize_perag(
continue;
}
- pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!pag) {
error = -ENOMEM;
goto out_unwind_new_pags;
@@ -389,7 +390,7 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
- error = radix_tree_preload(GFP_NOFS);
+ error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error)
goto out_free_pag;
@@ -416,9 +417,10 @@ xfs_initialize_perag(
init_waitqueue_head(&pag->pag_active_wq);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
+ xfs_hooks_init(&pag->pag_rmap_update_hooks);
#endif /* __KERNEL__ */
- error = xfs_buf_hash_init(pag);
+ error = xfs_buf_cache_init(&pag->pag_bcache);
if (error)
goto out_remove_pag;
@@ -453,7 +455,7 @@ out_remove_pag:
radix_tree_delete(&mp->m_perag_tree, index);
spin_unlock(&mp->m_perag_lock);
out_free_pag:
- kmem_free(pag);
+ kfree(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
xfs_free_unused_perag_range(mp, first_initialised, agcount);
@@ -491,7 +493,7 @@ xfs_btroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
}
/* Finish initializing a free space btree. */
@@ -549,7 +551,7 @@ xfs_freesp_init_recs(
}
/*
- * Alloc btree root block init functions
+ * bnobt/cntbt btree root block init functions
*/
static void
xfs_bnoroot_init(
@@ -557,17 +559,7 @@ xfs_bnoroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno);
- xfs_freesp_init_recs(mp, bp, id);
-}
-
-static void
-xfs_cntroot_init(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- struct aghdr_init_data *id)
-{
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
xfs_freesp_init_recs(mp, bp, id);
}
@@ -583,7 +575,7 @@ xfs_rmaproot_init(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_rmap_rec *rrec;
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno);
/*
* mark the AG header regions as static metadata The BNO
@@ -678,14 +670,13 @@ xfs_agfblock_init(
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(id->agno);
agf->agf_length = cpu_to_be32(id->agsize);
- agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
- agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
- agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_bno_level = cpu_to_be32(1);
+ agf->agf_cnt_level = cpu_to_be32(1);
if (xfs_has_rmapbt(mp)) {
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(XFS_RMAP_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_rmap_level = cpu_to_be32(1);
agf->agf_rmap_blocks = cpu_to_be32(1);
}
@@ -796,7 +787,7 @@ struct xfs_aghdr_grow_data {
size_t numblks;
const struct xfs_buf_ops *ops;
aghdr_init_work_f work;
- xfs_btnum_t type;
+ const struct xfs_btree_ops *bc_ops;
bool need_init;
};
@@ -850,13 +841,15 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
+ .bc_ops = &xfs_bnobt_ops,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_cntbt_buf_ops,
- .work = &xfs_cntroot_init,
+ .work = &xfs_bnoroot_init,
+ .bc_ops = &xfs_cntbt_ops,
.need_init = true
},
{ /* INO root block */
@@ -864,7 +857,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_inobt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_INO,
+ .bc_ops = &xfs_inobt_ops,
.need_init = true
},
{ /* FINO root block */
@@ -872,7 +865,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_FINO,
+ .bc_ops = &xfs_finobt_ops,
.need_init = xfs_has_finobt(mp)
},
{ /* RMAP root block */
@@ -880,6 +873,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_rmapbt_buf_ops,
.work = &xfs_rmaproot_init,
+ .bc_ops = &xfs_rmapbt_ops,
.need_init = xfs_has_rmapbt(mp)
},
{ /* REFC root block */
@@ -887,7 +881,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_refcountbt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_REFC,
+ .bc_ops = &xfs_refcountbt_ops,
.need_init = xfs_has_reflink(mp)
},
{ /* NULL terminating block */
@@ -905,7 +899,7 @@ xfs_ag_init_headers(
id->daddr = dp->daddr;
id->numblks = dp->numblks;
- id->type = dp->type;
+ id->bc_ops = dp->bc_ops;
error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
if (error)
break;
@@ -950,8 +944,10 @@ xfs_ag_shrink_space(
agf = agfbp->b_addr;
aglen = be32_to_cpu(agi->agi_length);
/* some extra paranoid checks before we shrink the ag */
- if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
+ }
if (delta >= aglen)
return -EINVAL;
@@ -979,14 +975,23 @@ xfs_ag_shrink_space(
if (error) {
/*
- * if extent allocation fails, need to roll the transaction to
+ * If extent allocation fails, need to roll the transaction to
* ensure that the AGFL fixup has been committed anyway.
+ *
+ * We need to hold the AGF across the roll to ensure nothing can
+ * access the AG for allocation until the shrink is fully
+ * cleaned up. And because resetting the AG block reservation
+ * space requires locking the AGI, we also have to hold that so
+ * we don't get AGI/AGF lock order inversions in the error
+ * handling path.
*/
xfs_trans_bhold(*tpp, agfbp);
+ xfs_trans_bhold(*tpp, agibp);
err2 = xfs_trans_roll(tpp);
if (err2)
return err2;
xfs_trans_bjoin(*tpp, agfbp);
+ xfs_trans_bjoin(*tpp, agibp);
goto resv_init_out;
}
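For reference: xfs_trans_bhold() keeps a buffer locked and referenced when the
transaction commits inside xfs_trans_roll(), and xfs_trans_bjoin() attaches it
to the follow-up transaction — which is why the AGF and AGI must be bholded
and rejoined as a pair here to preserve the lock ordering described above.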
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4b343c4fac28..35de09a2516c 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -36,8 +36,9 @@ struct xfs_perag {
atomic_t pag_active_ref; /* active reference count */
wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */
unsigned long pag_opstate;
- uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
+ uint8_t pagf_bno_level; /* # of levels in bno btree */
+ uint8_t pagf_cnt_level; /* # of levels in cnt btree */
+ uint8_t pagf_rmap_level;/* # of levels in rmap btree */
uint32_t pagf_flcount; /* count of blocks in freelist */
xfs_extlen_t pagf_freeblks; /* total free blocks */
xfs_extlen_t pagf_longest; /* longest free space */
@@ -86,8 +87,10 @@ struct xfs_perag {
* Alternate btree heights so that online repair won't trip the write
* verifiers while rebuilding the AG btrees.
*/
- uint8_t pagf_repair_levels[XFS_BTNUM_AGF];
+ uint8_t pagf_repair_bno_level;
+ uint8_t pagf_repair_cnt_level;
uint8_t pagf_repair_refcount_level;
+ uint8_t pagf_repair_rmap_level;
#endif
spinlock_t pag_state_lock;
@@ -104,9 +107,7 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
- /* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
- struct rhashtable pag_buf_hash;
+ struct xfs_buf_cache pag_bcache;
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
@@ -119,6 +120,9 @@ struct xfs_perag {
* inconsistencies.
*/
struct xfs_defer_drain pag_intents_drain;
+
+ /* Hook to feed rmapbt updates to an active online repair. */
+ struct xfs_hooks pag_rmap_update_hooks;
#endif /* __KERNEL__ */
};
@@ -331,7 +335,7 @@ struct aghdr_init_data {
/* per header data */
xfs_daddr_t daddr; /* header location */
size_t numblks; /* size of header */
- xfs_btnum_t type; /* type of btree root block */
+ const struct xfs_btree_ops *bc_ops; /* btree ops */
};
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3bd0a33fee0a..9da52e92172a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_extfree_item_cache;
@@ -150,23 +151,38 @@ xfs_alloc_ag_max_usable(
return mp->m_sb.sb_agblocks - blocks;
}
+
+static int
+xfs_alloc_lookup(
+ struct xfs_btree_cur *cur,
+ xfs_lookup_t dir,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ int *stat)
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, dir, stat);
+ if (*stat == 1)
+ cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
+ else
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ return error;
+}
+
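The new xfs_alloc_lookup() helper folds the three nearly identical lookup
wrappers into one and moves the cursor-active state from the allocbt-specific
bc_ag.abt.active field into a generic cursor flag (XFS_BTREE_ALLOCBT_ACTIVE),
so the active test below can work purely off bc_flags.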
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
*/
-STATIC int /* error */
+static inline int /* error */
xfs_alloc_lookup_eq(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}
/*
@@ -180,13 +196,7 @@ xfs_alloc_lookup_ge(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}
/*
@@ -200,19 +210,14 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
}
static inline bool
xfs_alloc_cur_active(
struct xfs_btree_cur *cur)
{
- return cur && cur->bc_ag.abt.active;
+ return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
}
/*
@@ -268,12 +273,12 @@ xfs_alloc_complain_bad_rec(
struct xfs_mount *mp = cur->bc_mp;
xfs_warn(mp,
- "%s Freespace BTree record corruption in AG %d detected at %pS!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
- cur->bc_ag.pag->pag_agno, fa);
+ "%sbt record corruption in AG %d detected at %pS!",
+ cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start block 0x%x block count 0x%x", irec->ar_startblock,
irec->ar_blockcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -497,14 +502,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Look up the record in the by-block tree if necessary.
@@ -516,14 +525,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#ifdef DEBUG
@@ -536,8 +549,10 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
- cntblock->bb_numrecs))
+ cntblock->bb_numrecs)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#endif
@@ -567,30 +582,40 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Fix up the by-block btree entry(s).
@@ -601,8 +626,10 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -616,12 +643,16 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
return 0;
}
@@ -755,6 +786,8 @@ xfs_alloc_read_agfl(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_AGFL_REF);
@@ -776,6 +809,7 @@ xfs_alloc_update_counters(
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length))) {
xfs_buf_mark_corrupt(agbp);
+ xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
}
@@ -828,8 +862,8 @@ xfs_alloc_cur_setup(
* attempt a small allocation.
*/
if (!acur->cnt)
- acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_CNT);
+ acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
if (error)
return error;
@@ -838,11 +872,11 @@ xfs_alloc_cur_setup(
* Allocate the bnobt left and right search cursors.
*/
if (!acur->bnolt)
- acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
if (!acur->bnogt)
- acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
return i == 1 ? 0 : -ENOSPC;
}
@@ -884,15 +918,17 @@ xfs_alloc_cur_check(
bool busy;
unsigned busy_gen = 0;
bool deactivate = false;
- bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+ bool isbnobt = xfs_btree_is_bno(cur->bc_ops);
*new = 0;
error = xfs_alloc_get_rec(cur, &bno, &len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/*
* Check minlen and deactivate a cntbt cursor if out of acceptable size
@@ -958,9 +994,8 @@ xfs_alloc_cur_check(
deactivate = true;
out:
if (deactivate)
- cur->bc_ag.abt.active = false;
- trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
- *new);
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
return 0;
}
@@ -1098,6 +1133,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1132,6 +1168,7 @@ xfs_alloc_ag_vextent_small(
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1197,8 +1234,8 @@ xfs_alloc_ag_vextent_exact(
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
@@ -1218,6 +1255,7 @@ xfs_alloc_ag_vextent_exact(
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1257,8 +1295,8 @@ xfs_alloc_ag_vextent_exact(
* We are allocating agbno for args->len
* Allocate/initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
@@ -1330,7 +1368,7 @@ xfs_alloc_walk_iter(
if (error)
return error;
if (i == 0)
- cur->bc_ag.abt.active = false;
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
if (count > 0)
count--;
@@ -1444,7 +1482,7 @@ xfs_alloc_ag_vextent_locality(
if (error)
return error;
if (i) {
- acur->cnt->bc_ag.abt.active = true;
+ acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
fbcur = acur->cnt;
fbinc = false;
}
@@ -1497,8 +1535,10 @@ xfs_alloc_ag_vextent_lastblock(
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(acur->cnt);
return -EFSCORRUPTED;
+ }
if (*len >= args->minlen)
break;
error = xfs_btree_increment(acur->cnt, 0, &i);
@@ -1670,8 +1710,8 @@ restart:
/*
* Allocate and initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
bno_cur = NULL;
/*
@@ -1710,6 +1750,7 @@ restart:
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1756,6 +1797,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1778,6 +1820,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1790,6 +1833,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1806,6 +1850,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1844,14 +1889,15 @@ restart:
rlen = args->len;
if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
/*
* Allocate and initialize a cursor for the by-block tree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1863,6 +1909,7 @@ restart:
if (XFS_IS_CORRUPT(args->mp,
args->agbno + args->len >
be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1924,7 +1971,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1938,6 +1985,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1953,6 +2001,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1971,6 +2020,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1986,6 +2036,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1994,7 +2045,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -2006,12 +2057,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2021,12 +2074,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2036,6 +2091,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2045,6 +2101,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2064,6 +2121,7 @@ xfs_free_ag_extent(
i != 1 ||
xxbno != ltbno ||
xxlen != ltlen)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2088,12 +2146,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2104,6 +2164,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2123,12 +2184,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2151,6 +2214,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2163,12 +2227,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2267,8 +2333,9 @@ xfs_alloc_min_freelist(
struct xfs_perag *pag)
{
/* AG btrees have at least 1 level. */
- static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
- const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
+ const unsigned int bno_level = pag ? pag->pagf_bno_level : 1;
+ const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1;
+ const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1;
unsigned int min_free;
ASSERT(mp->m_alloc_maxlevels > 0);
@@ -2295,16 +2362,12 @@ xfs_alloc_min_freelist(
*/
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels) * 2 - 2;
+ min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels) * 2 - 2;
+ min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
- min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels) * 2 - 2;
-
+ min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
@@ -2691,13 +2754,14 @@ xfs_exact_minlen_extent_available(
xfs_extlen_t flen;
int error = 0;
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag);
error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
if (error)
goto out;
if (*stat == 0) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -2987,8 +3051,8 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_versionnum),
offsetof(xfs_agf_t, agf_seqno),
offsetof(xfs_agf_t, agf_length),
- offsetof(xfs_agf_t, agf_roots[0]),
- offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */
+ offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */
offsetof(xfs_agf_t, agf_flfirst),
offsetof(xfs_agf_t, agf_fllast),
offsetof(xfs_agf_t, agf_flcount),
@@ -3167,12 +3231,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_freeblks) > agf_length)
return __this_address;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
- mp->m_alloc_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
- mp->m_alloc_maxlevels)
+ if (be32_to_cpu(agf->agf_bno_level) < 1 ||
+ be32_to_cpu(agf->agf_cnt_level) < 1 ||
+ be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_lazysbcount(mp) &&
@@ -3183,9 +3245,8 @@ xfs_agf_verify(
if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
return __this_address;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
- mp->m_rmap_maxlevels)
+ if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
+ be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
return __this_address;
}
@@ -3268,6 +3329,8 @@ xfs_read_agf(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
if (error)
return error;
@@ -3309,12 +3372,9 @@ xfs_alloc_read_agf(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
if (xfs_agfl_needs_reset(pag->pag_mount, agf))
set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
@@ -3343,10 +3403,8 @@ xfs_alloc_read_agf(
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
- ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
+ ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
}
#endif
if (agfbpp)
@@ -3895,17 +3953,23 @@ __xfs_free_extent(
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
- if (error)
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
return error;
+ }
+
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
/* validate the extent size is legal now we have the agf locked */
if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
@@ -3962,7 +4026,7 @@ xfs_alloc_query_range(
union xfs_btree_irec high_brec = { .a = *high_rec };
struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
return xfs_btree_query_range(cur, &low_brec, &high_brec,
xfs_alloc_query_range_helper, &query);
}
@@ -3976,7 +4040,7 @@ xfs_alloc_query_all(
{
struct xfs_alloc_query_range_info query;
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
query.priv = priv;
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a7032bf0cd37..6ef5ddd89600 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -16,6 +16,7 @@
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
@@ -23,13 +24,22 @@
static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
-xfs_allocbt_dup_cursor(
+xfs_bnobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+ return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+ cur->bc_ag.pag);
}
+STATIC struct xfs_btree_cur *
+xfs_cntbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+ cur->bc_ag.pag);
+}
+
STATIC void
xfs_allocbt_set_root(
struct xfs_btree_cur *cur,
@@ -38,13 +48,18 @@ xfs_allocbt_set_root(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- int btnum = cur->bc_btnum;
ASSERT(ptr->s != 0);
- agf->agf_roots[btnum] = ptr->s;
- be32_add_cpu(&agf->agf_levels[btnum], inc);
- cur->bc_ag.pag->pagf_levels[btnum] += inc;
+ if (xfs_btree_is_bno(cur->bc_ops)) {
+ agf->agf_bno_root = ptr->s;
+ be32_add_cpu(&agf->agf_bno_level, inc);
+ cur->bc_ag.pag->pagf_bno_level += inc;
+ } else {
+ agf->agf_cnt_root = ptr->s;
+ be32_add_cpu(&agf->agf_cnt_level, inc);
+ cur->bc_ag.pag->pagf_cnt_level += inc;
+ }
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -116,7 +131,7 @@ xfs_allocbt_update_lastrec(
__be32 len;
int numrecs;
- ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+ ASSERT(!xfs_btree_is_bno(cur->bc_ops));
switch (reason) {
case LASTREC_UPDATE:
@@ -226,7 +241,10 @@ xfs_allocbt_init_ptr_from_cur(
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
- ptr->s = agf->agf_roots[cur->bc_btnum];
+ if (xfs_btree_is_bno(cur->bc_ops))
+ ptr->s = agf->agf_bno_root;
+ else
+ ptr->s = agf->agf_cnt_root;
}
STATIC int64_t
@@ -299,13 +317,12 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
- xfs_btnum_t btnum = XFS_BTNUM_BNOi;
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (xfs_has_crc(mp)) {
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
}
@@ -320,26 +337,32 @@ xfs_allocbt_verify(
* against.
*/
level = be16_to_cpu(block->bb_level);
- if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
- btnum = XFS_BTNUM_CNTi;
if (pag && xfs_perag_initialised_agf(pag)) {
- unsigned int maxlevel = pag->pagf_levels[btnum];
+ unsigned int maxlevel, repair_maxlevel = 0;
-#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Online repair could be rewriting the free space btrees, so
* we'll validate against the larger of either tree while this
* is going on.
*/
- maxlevel = max_t(unsigned int, maxlevel,
- pag->pagf_repair_levels[btnum]);
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) {
+ maxlevel = pag->pagf_cnt_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ repair_maxlevel = pag->pagf_repair_cnt_level;
+#endif
+ } else {
+ maxlevel = pag->pagf_bno_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ repair_maxlevel = pag->pagf_repair_bno_level;
#endif
- if (level >= maxlevel)
+ }
+
+ if (level >= max(maxlevel, repair_maxlevel))
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
static void
@@ -348,7 +371,7 @@ xfs_allocbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_allocbt_verify(bp);
@@ -372,7 +395,7 @@ xfs_allocbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -454,11 +477,19 @@ xfs_allocbt_keys_contiguous(
be32_to_cpu(key2->alloc.ar_startblock));
}
-static const struct xfs_btree_ops xfs_bnobt_ops = {
+const struct xfs_btree_ops xfs_bnobt_ops = {
+ .name = "bno",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
- .dup_cursor = xfs_allocbt_dup_cursor,
+ .lru_refs = XFS_ALLOC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2),
+ .sick_mask = XFS_SICK_AG_BNOBT,
+
+ .dup_cursor = xfs_bnobt_dup_cursor,
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
@@ -477,11 +508,20 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.keys_contiguous = xfs_allocbt_keys_contiguous,
};
-static const struct xfs_btree_ops xfs_cntbt_ops = {
+const struct xfs_btree_ops xfs_cntbt_ops = {
+ .name = "cnt",
+ .type = XFS_BTREE_TYPE_AG,
+ .geom_flags = XFS_BTGEO_LASTREC_UPDATE,
+
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_ALLOC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2),
+ .sick_mask = XFS_SICK_AG_CNTBT,
- .dup_cursor = xfs_allocbt_dup_cursor,
+ .dup_cursor = xfs_cntbt_dup_cursor,
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
@@ -500,76 +540,55 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.keys_contiguous = NULL, /* not needed right now */
};
-/* Allocate most of a new allocation btree cursor. */
-STATIC struct xfs_btree_cur *
-xfs_allocbt_init_common(
+/*
+ * Allocate a new bnobt cursor.
+ *
+ * For staging cursors, tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_bnobt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
- cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
- xfs_allocbt_cur_cache);
- cur->bc_ag.abt.active = false;
-
- if (btnum == XFS_BTNUM_CNT) {
- cur->bc_ops = &xfs_cntbt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
- cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
- } else {
- cur->bc_ops = &xfs_bnobt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- }
-
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops,
+ mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
+ cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level);
+ }
return cur;
}
/*
- * Allocate a new allocation btree cursor.
+ * Allocate a new cntbt cursor.
+ *
+ * For staging cursors, tp and agbp are NULL.
*/
-struct xfs_btree_cur * /* new alloc btree cursor */
-xfs_allocbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agf structure */
- struct xfs_perag *pag,
- xfs_btnum_t btnum) /* btree identifier */
-{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_btree_cur *cur;
-
- cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
- if (btnum == XFS_BTNUM_CNT)
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- else
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
-
- cur->bc_ag.agbp = agbp;
-
- return cur;
-}
-
-/* Create a free space btree cursor with a fake root for staging. */
struct xfs_btree_cur *
-xfs_allocbt_stage_cursor(
+xfs_cntbt_init_cursor(
struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops,
+ mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
+ cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level);
+ }
return cur;
}
@@ -588,16 +607,16 @@ xfs_allocbt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
- agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
- xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ if (xfs_btree_is_bno(cur->bc_ops)) {
+ agf->agf_bno_root = cpu_to_be32(afake->af_root);
+ agf->agf_bno_level = cpu_to_be32(afake->af_levels);
} else {
- cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ agf->agf_cnt_root = cpu_to_be32(afake->af_root);
+ agf->agf_cnt_level = cpu_to_be32(afake->af_levels);
}
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in an alloc btree block. */
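With xfs_allocbt_init_cursor() and xfs_allocbt_stage_cursor() folded into per-btree constructors, callers now select the tree by function rather than by btnum. A hedged usage sketch (the wrapper and variable names are illustrative, not from this patch):

    /* Sketch: set up and tear down both free space btree cursors. */
    static void alloc_btree_cursors_example(struct xfs_mount *mp,
                    struct xfs_trans *tp, struct xfs_buf *agbp,
                    struct xfs_perag *pag)
    {
            struct xfs_btree_cur    *bno_cur, *cnt_cur;

            bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
            cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
            /* ... by-bno and by-size lookups ... */
            xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
            xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
    }

Per the comments above, passing a NULL agbp produces a staging cursor; bc_nlevels is then left for the fake-root code to fill in.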
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45df893ef6bb..155b47f231ab 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -47,12 +47,12 @@ struct xbtree_afakeroot;
(maxrecs) * sizeof(xfs_alloc_key_t) + \
((index) - 1) * sizeof(xfs_alloc_ptr_t)))
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_perag *pag, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e965a48e7db9..673a4b6d2e8d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -224,7 +224,7 @@ int
xfs_attr_get_ilocked(
struct xfs_da_args *args)
{
- ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
@@ -891,7 +891,8 @@ xfs_attr_defer_add(
struct xfs_attr_intent *new;
- new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new = kmem_cache_zalloc(xfs_attr_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
new->xattri_op_flags = op_flags;
new->xattri_da_args = args;
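The GFP_NOFS to GFP_KERNEL switch above leans on the scoped allocation API: a context that must not recurse into the filesystem (an open transaction, for instance) marks itself once, and every GFP_KERNEL allocation inside that window is implicitly NOFS. A minimal sketch of the mechanism; memalloc_nofs_save()/memalloc_nofs_restore() are the real kernel interfaces, the wrapper function is illustrative:

    #include <linux/sched/mm.h>
    #include <linux/slab.h>

    /* Illustrative: scope NOFS once instead of tagging each allocation. */
    static void *alloc_under_nofs_scope(size_t size)
    {
            unsigned int nofs_flags = memalloc_nofs_save();
            void *p = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL);

            memalloc_nofs_restore(nofs_flags);
            return p;
    }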
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6374bf107242..ac904cc1a97b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -29,6 +29,7 @@
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
/*
@@ -879,8 +880,7 @@ xfs_attr_shortform_to_leaf(
trace_xfs_attr_sf_to_leaf(args);
- tmpbuffer = kmem_alloc(size, 0);
- ASSERT(tmpbuffer != NULL);
+ tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, ifp->if_data, size);
sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
@@ -924,7 +924,7 @@ xfs_attr_shortform_to_leaf(
}
error = 0;
out:
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
return error;
}
@@ -1059,7 +1059,7 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
if (!tmpbuffer)
return -ENOMEM;
@@ -1125,7 +1125,7 @@ xfs_attr3_leaf_to_shortform(
error = 0;
out:
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
return error;
}
@@ -1533,7 +1533,7 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1571,7 +1571,7 @@ xfs_attr3_leaf_compact(
*/
xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
}
/*
@@ -2250,7 +2250,8 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
+ tmp_leaf = kzalloc(state->args->geo->blksize,
+ GFP_KERNEL | __GFP_NOFAIL);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2290,7 +2291,7 @@ xfs_attr3_leaf_unbalance(
}
memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
- kmem_free(tmp_leaf);
+ kfree(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
@@ -2343,6 +2344,7 @@ xfs_attr3_leaf_lookup_int(
entries = xfs_attr3_leaf_entryp(leaf);
if (ichdr.count >= args->geo->blksize / 8) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -2362,10 +2364,12 @@ xfs_attr3_leaf_lookup_int(
}
if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
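The kmem_alloc()/kmem_zalloc()/kmem_free() conversions in this file map one-to-one onto the generic slab API. One side note: the `if (!tmpbuffer) return -ENOMEM;` retained in xfs_attr3_leaf_to_shortform can never trigger once the allocation carries __GFP_NOFAIL. A sketch of the equivalences; the compat_* names are hypothetical:

    #include <linux/slab.h>

    /* kmem_alloc(size, 0)  ->  kmalloc(size, GFP_KERNEL | __GFP_NOFAIL) */
    static inline void *compat_kmem_alloc(size_t size)
    {
            return kmalloc(size, GFP_KERNEL | __GFP_NOFAIL);
    }

    /* kmem_zalloc(size, 0)  ->  kzalloc(size, GFP_KERNEL | __GFP_NOFAIL) */
    static inline void *compat_kmem_zalloc(size_t size)
    {
            return kzalloc(size, GFP_KERNEL | __GFP_NOFAIL);
    }

    /* kmem_free(p)  ->  kfree(p) */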
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..ff0412828772 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
#include "xfs_attr_remote.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -276,17 +277,18 @@ xfs_attr3_rmt_hdr_set(
*/
STATIC int
xfs_attr_rmtval_copyout(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_ino_t ino,
- int *offset,
- int *valuelen,
- uint8_t **dst)
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp,
+ int *offset,
+ int *valuelen,
+ uint8_t **dst)
{
- char *src = bp->b_addr;
- xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ char *src = bp->b_addr;
+ xfs_ino_t ino = dp->i_ino;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
@@ -302,6 +304,7 @@ xfs_attr_rmtval_copyout(
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
bno, *offset, byte_cnt, ino);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
@@ -418,10 +421,12 @@ xfs_attr_rmtval_get(
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
0, &bp, &xfs_attr3_rmt_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
if (error)
return error;
- error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
&offset, &valuelen,
&dst);
xfs_buf_relse(bp);
@@ -545,11 +550,13 @@ xfs_attr_rmtval_stale(
struct xfs_buf *bp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
- XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) {
+ xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
@@ -659,8 +666,10 @@ xfs_attr_rmtval_invalidate(
blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) {
+ xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
if (error)
return error;
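xfs_metadata_is_sick() decides which errors escalate into health tracking. Its body is not shown in these hunks; presumably it passes only the metadata corruption classes, so transient failures such as -EIO or -ENOMEM never mark an object sick. A sketch under that assumption:

    /* Sketch, assuming only corruption error codes count as sick. */
    static inline bool xfs_metadata_is_sick(int error)
    {
            return unlikely(error == -EFSCORRUPTED || error == -EFSBADCRC);
    }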
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f362345467fa..656c95a22f2e 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -36,6 +36,9 @@
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
+#include "xfs_health.h"
+#include "xfs_bmap_item.h"
+#include "xfs_symlink_remote.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -225,6 +228,28 @@ xfs_bmap_forkoff_reset(
}
}
+static int
+xfs_bmap_read_buf(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp) /* buffer for fsbno */
+{
+ struct xfs_buf *bp; /* return value */
+ int error;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return -EFSCORRUPTED;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp,
+ &xfs_bmbt_buf_ops);
+ if (!error) {
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+ *bpp = bp;
+ }
+ return error;
+}
+
#ifdef DEBUG
STATIC struct xfs_buf *
xfs_bmap_get_bp(
@@ -364,9 +389,9 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
goto error_norelse;
}
@@ -383,6 +408,7 @@ xfs_bmap_check_leaf_extents(
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -450,9 +476,9 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
goto error_norelse;
}
@@ -562,11 +588,14 @@ xfs_bmap_btree_to_extents(
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
#ifdef DEBUG
- if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, tp, cbno, &cbp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
return error;
cblock = XFS_BUF_TO_BLOCK(cbp);
@@ -634,14 +663,13 @@ xfs_bmap_extents_to_btree(
* Fill in the root.
*/
block = ifp->if_broot;
- xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
- XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, block, NULL, 1, 1);
/*
* Need a cursor. Can't allocate until bb_level is filled in.
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ if (wasdel)
+ cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
/*
* Convert to a btree with two levels, one record in root.
*/
@@ -667,7 +695,7 @@ xfs_bmap_extents_to_btree(
goto out_root_realloc;
}
- cur->bc_ino.allocated++;
+ cur->bc_bmap.allocated++;
ip->i_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
@@ -679,11 +707,8 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the child block.
*/
- abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
- xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
- XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, ablock, abp, 0, 0);
for_each_xfs_iext(ifp, &icur, &rec) {
if (isnullstartblock(rec.br_startblock))
@@ -878,6 +903,7 @@ xfs_bmap_add_attrfork_btree(
goto error0;
/* must be at least one entry */
if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -887,7 +913,7 @@ xfs_bmap_add_attrfork_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return -ENOSPC;
}
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}
return 0;
@@ -915,7 +941,7 @@ xfs_bmap_add_attrfork_extents(
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
XFS_DATA_FORK);
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -960,6 +986,7 @@ xfs_bmap_add_attrfork_local(
/* should only be called for types that support local format data */
ASSERT(0);
+ xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -1143,6 +1170,7 @@ xfs_iread_bmbt_block(
(unsigned long long)ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
sizeof(*block), __this_address);
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -1158,6 +1186,7 @@ xfs_iread_bmbt_block(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iread_extents(2)", frp,
sizeof(*frp), fa);
+ xfs_bmap_mark_sick(ip, whichfork);
return xfs_bmap_complain_bad_rec(ip, whichfork, fa,
&new);
}
@@ -1189,7 +1218,7 @@ xfs_iread_extents(
if (!xfs_need_iread_extents(ifp))
return 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ir.loaded = 0;
xfs_iext_first(ifp, &ir.icur);
@@ -1201,6 +1230,7 @@ xfs_iread_extents(
goto out;
if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto out;
}
@@ -1213,6 +1243,8 @@ xfs_iread_extents(
smp_store_release(&ifp->if_needextents, 0);
return 0;
out:
+ if (xfs_metadata_is_sick(error))
+ xfs_bmap_mark_sick(ip, whichfork);
xfs_iext_destroy(ifp);
return error;
}
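From here on the patch repeats one idiom at nearly every btree stat check: corruption detected, cursor marked sick, -EFSCORRUPTED returned. The patch open-codes it each time; the following macro is purely hypothetical, shown only to make the recurring shape explicit:

    /*
     * Hypothetical consolidation, not in this patch. "error" and the
     * jump label are expected to exist in the calling scope.
     */
    #define XFS_BTREE_CHECK_STAT(mp, cur, cond, label)           \
    do {                                                         \
            if (XFS_IS_CORRUPT((mp), (cond))) {                  \
                    xfs_btree_mark_sick(cur);                    \
                    error = -EFSCORRUPTED;                       \
                    goto label;                                  \
            }                                                    \
    } while (0)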
@@ -1292,6 +1324,7 @@ xfs_bmap_last_before(
break;
default:
ASSERT(0);
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -1388,8 +1421,10 @@ xfs_bmap_last_offset(
if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
return 0;
- if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -1429,8 +1464,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(whichfork != XFS_ATTR_FORK);
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!bma->cur ||
- (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+ ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -1528,6 +1562,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1535,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1542,6 +1578,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1571,6 +1608,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1604,6 +1642,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1632,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1639,6 +1679,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1673,6 +1714,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1698,6 +1740,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1705,6 +1748,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1721,7 +1765,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_ino.allocated : 0));
+ (bma->cur ? bma->cur->bc_bmap.allocated : 0));
PREV.br_startoff = new_endoff;
PREV.br_blockcount = temp;
@@ -1749,6 +1793,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1785,6 +1830,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1792,6 +1838,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1808,7 +1855,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_ino.allocated : 0));
+ (bma->cur ? bma->cur->bc_bmap.allocated : 0));
PREV.br_startblock = nullstartblock(da_new);
PREV.br_blockcount = temp;
@@ -1871,6 +1918,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1878,6 +1926,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1929,8 +1978,8 @@ xfs_bmap_add_extent_delay_real(
xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
if (bma->cur) {
- da_new += bma->cur->bc_ino.allocated;
- bma->cur->bc_ino.allocated = 0;
+ da_new += bma->cur->bc_bmap.allocated;
+ bma->cur->bc_bmap.allocated = 0;
}
/* adjust for changes in reserved delayed indirect blocks */
@@ -2074,30 +2123,35 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2126,18 +2180,21 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2169,18 +2226,21 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2207,6 +2267,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2240,6 +2301,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2277,6 +2339,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2287,6 +2350,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2317,6 +2381,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2353,6 +2418,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2363,12 +2429,14 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2405,6 +2473,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2417,6 +2486,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2429,6 +2499,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2436,6 +2507,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2472,7 +2544,7 @@ xfs_bmap_add_extent_unwritten_real(
/* clear out the allocated field, done with it now in any case. */
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
*curp = cur;
}
@@ -2651,7 +2723,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+ ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2721,6 +2793,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2728,6 +2801,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2735,6 +2809,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2764,6 +2839,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2794,6 +2870,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2820,6 +2897,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2827,6 +2905,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2853,7 +2932,7 @@ xfs_bmap_add_extent_hole_real(
/* clear out the allocated field, done with it now in any case. */
if (cur)
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
@@ -3898,14 +3977,18 @@ xfs_bmapi_read(
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (WARN_ON_ONCE(!ifp))
+ if (WARN_ON_ONCE(!ifp)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (xfs_is_shutdown(mp))
return -EIO;
@@ -4160,9 +4243,8 @@ xfs_bmapi_allocate(
*/
bma->nallocs++;
- if (bma->cur)
- bma->cur->bc_ino.flags =
- bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ if (bma->cur && bma->wasdel)
+ bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
@@ -4369,7 +4451,7 @@ xfs_bmapi_write(
ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!(flags & XFS_BMAPI_REMAP));
/* zeroing is for currently only for data extents, not metadata */
@@ -4386,6 +4468,7 @@ xfs_bmapi_write(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4613,9 +4696,11 @@ xfs_bmapi_convert_delalloc(
error = -ENOSPC;
if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
goto out_finish;
- error = -EFSCORRUPTED;
- if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
+ error = -EFSCORRUPTED;
goto out_finish;
+ }
XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
XFS_STATS_INC(mp, xs_xstrat_quick);
@@ -4666,7 +4751,7 @@ xfs_bmapi_remap(
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
@@ -4674,6 +4759,7 @@ xfs_bmapi_remap(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4693,10 +4779,8 @@ xfs_bmapi_remap(
ip->i_nblocks += len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
got.br_startoff = bno;
got.br_startblock = startblock;
@@ -4831,7 +4915,7 @@ xfs_bmap_del_extent_delay(
XFS_STATS_INC(mp, xs_del_exlist);
- isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ isrt = xfs_ifork_is_realtime(ip, whichfork);
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got->br_startoff + got->br_blockcount;
da_old = startblockval(got->br_startblock);
@@ -5067,7 +5151,7 @@ xfs_bmap_del_extent_real(
return -ENOSPC;
*logflagsp = XFS_ILOG_CORE;
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ if (xfs_ifork_is_realtime(ip, whichfork)) {
if (!(bflags & XFS_BMAPI_REMAP)) {
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
@@ -5088,8 +5172,10 @@ xfs_bmap_del_extent_real(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
if (got.br_startoff == del->br_startoff)
@@ -5113,8 +5199,10 @@ xfs_bmap_del_extent_real(
}
if ((error = xfs_btree_delete(cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
break;
case BMAP_LEFT_FILLING:
/*
@@ -5186,8 +5274,10 @@ xfs_bmap_del_extent_real(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/*
* Update the btree record back
* to the original value.
@@ -5203,8 +5293,10 @@ xfs_bmap_del_extent_real(
*logflagsp = 0;
return -ENOSPC;
}
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
} else
*logflagsp |= xfs_ilog_fext(whichfork);
@@ -5286,12 +5378,14 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
ifp = xfs_ifork_ptr(ip, whichfork);
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(len > 0);
ASSERT(nexts >= 0);
@@ -5304,7 +5398,7 @@ __xfs_bunmapi(
return 0;
}
XFS_STATS_INC(mp, xs_blk_unmap);
- isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ isrt = xfs_ifork_is_realtime(ip, whichfork);
end = start + len;
if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
@@ -5317,7 +5411,6 @@ __xfs_bunmapi(
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
} else
cur = NULL;
@@ -5367,7 +5460,7 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- if (!isrt)
+ if (!isrt || (flags & XFS_BMAPI_REMAP))
goto delete;
mod = xfs_rtb_to_rtxoff(mp,
@@ -5385,7 +5478,7 @@ __xfs_bunmapi(
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
*/
- ASSERT(end >= mod);
+ ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod);
end -= mod > del.br_blockcount ?
del.br_blockcount : mod;
if (end < got.br_startoff &&
@@ -5555,7 +5648,7 @@ error0:
xfs_trans_log_inode(tp, ip, logflags);
if (cur) {
if (!error)
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -5635,8 +5728,7 @@ xfs_bmse_merge(
blockcount = left->br_blockcount + got->br_blockcount;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
ASSERT(xfs_bmse_can_merge(left, got, shift));
new = *left;
@@ -5657,21 +5749,27 @@ xfs_bmse_merge(
error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_btree_delete(cur, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_bmbt_update(cur, &new);
if (error)
@@ -5719,8 +5817,10 @@ xfs_bmap_shift_update_extent(
error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_bmbt_update(cur, got);
if (error)
@@ -5758,28 +5858,28 @@ xfs_bmap_collapse_extents(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
*done = true;
goto del_cursor;
}
if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5837,7 +5937,7 @@ xfs_bmap_can_insert_extents(
int is_empty;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
@@ -5873,22 +5973,21 @@ xfs_bmap_insert_extents(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
if (*next_fsb == NULLFSBLOCK) {
xfs_iext_last(ifp, &icur);
@@ -5904,11 +6003,13 @@ xfs_bmap_insert_extents(
}
}
if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5976,6 +6077,7 @@ xfs_bmap_split_extent(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6002,11 +6104,11 @@ xfs_bmap_split_extent(
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6034,6 +6136,7 @@ xfs_bmap_split_extent(
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6041,6 +6144,7 @@ xfs_bmap_split_extent(
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6060,7 +6164,7 @@ xfs_bmap_split_extent(
del_cursor:
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
@@ -6069,17 +6173,8 @@ del_cursor:
return error;
}
-/* Deferred mapping is only for real extents in the data fork. */
-static bool
-xfs_bmap_is_update_needed(
- struct xfs_bmbt_irec *bmap)
-{
- return bmap->br_startblock != HOLESTARTBLOCK &&
- bmap->br_startblock != DELAYSTARTBLOCK;
-}
-
/* Record a bmap intent. */
-static int
+static inline void
__xfs_bmap_add(
struct xfs_trans *tp,
enum xfs_bmap_intent_type type,
@@ -6089,25 +6184,19 @@ __xfs_bmap_add(
{
struct xfs_bmap_intent *bi;
- trace_xfs_bmap_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- ip->i_ino, whichfork,
- bmap->br_startoff,
- bmap->br_blockcount,
- bmap->br_state);
+ if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) ||
+ bmap->br_startblock == HOLESTARTBLOCK ||
+ bmap->br_startblock == DELAYSTARTBLOCK)
+ return;
- bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
bi->bi_whichfork = whichfork;
bi->bi_bmap = *bmap;
- xfs_bmap_update_get_group(tp->t_mountp, bi);
- xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
- return 0;
+ xfs_bmap_defer_add(tp, bi);
}
/* Map an extent into a file. */
@@ -6115,12 +6204,10 @@ void
xfs_bmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_bmap_is_update_needed(PREV))
- return;
-
- __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -6128,12 +6215,10 @@ void
xfs_bmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_bmap_is_update_needed(PREV))
- return;
-
- __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV);
}
/*
@@ -6147,36 +6232,35 @@ xfs_bmap_finish_one(
{
struct xfs_bmbt_irec *bmap = &bi->bi_bmap;
int error = 0;
+ int flags = 0;
- ASSERT(tp->t_highest_agno == NULLAGNUMBER);
+ if (bi->bi_whichfork == XFS_ATTR_FORK)
+ flags |= XFS_BMAPI_ATTRFORK;
- trace_xfs_bmap_deferred(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- bi->bi_type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- bi->bi_owner->i_ino, bi->bi_whichfork,
- bmap->br_startoff, bmap->br_blockcount,
- bmap->br_state);
+ ASSERT(tp->t_highest_agno == NULLAGNUMBER);
- if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK))
- return -EFSCORRUPTED;
+ trace_xfs_bmap_deferred(bi);
- if (XFS_TEST_ERROR(false, tp->t_mountp,
- XFS_ERRTAG_BMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {
case XFS_BMAP_MAP:
+ if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+ flags |= XFS_BMAPI_PREALLOC;
error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff,
- bmap->br_blockcount, bmap->br_startblock, 0);
+ bmap->br_blockcount, bmap->br_startblock,
+ flags);
bmap->br_blockcount = 0;
break;
case XFS_BMAP_UNMAP:
error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff,
- &bmap->br_blockcount, XFS_BMAPI_REMAP, 1);
+ &bmap->br_blockcount, flags | XFS_BMAPI_REMAP,
+ 1);
break;
default:
ASSERT(0);
+ xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork);
error = -EFSCORRUPTED;
}
@@ -6257,7 +6341,7 @@ xfs_bunmapi_range(
xfs_filblks_t unmap_len = endoff - startoff + 1;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
while (unmap_len > 0) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -6274,3 +6358,46 @@ xfs_bunmapi_range(
out:
return error;
}
+
+struct xfs_bmap_query_range {
+ xfs_bmap_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_bmap_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_bmap_query_range *query = priv;
+ struct xfs_bmbt_irec irec;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork,
+ &irec);
+ if (fa) {
+ xfs_btree_mark_sick(cur);
+ return xfs_bmap_complain_bad_rec(cur->bc_ino.ip,
+ cur->bc_ino.whichfork, fa, &irec);
+ }
+
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all bmaps. */
+int
+xfs_bmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_bmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_bmap_query_range query = {
+ .priv = priv,
+ .fn = fn,
+ };
+
+ return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
+}
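A usage sketch for the query interface just added: walk every record in a bmap btree and tally them. The callback names are illustrative, not from the patch; note that the helper validates each record and marks the cursor sick before the callback ever sees a bad one.

    /* Illustrative callback; a nonzero return would abort the walk. */
    static int xfs_count_bmaps(struct xfs_btree_cur *cur,
                    struct xfs_bmbt_irec *irec, void *priv)
    {
            unsigned long long *nr = priv;

            (*nr)++;
            return 0;
    }

    static int xfs_count_fork_mappings(struct xfs_btree_cur *cur,
                    unsigned long long *nr)
    {
            *nr = 0;
            return xfs_bmap_query_all(cur, xfs_count_bmaps, nr);
    }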
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f6b73f1bad5f..f7662595309d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -232,6 +232,10 @@ enum xfs_bmap_intent_type {
XFS_BMAP_UNMAP,
};
+#define XFS_BMAP_INTENT_STRINGS \
+ { XFS_BMAP_MAP, "map" }, \
+ { XFS_BMAP_UNMAP, "unmap" }
+
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
@@ -241,14 +245,11 @@ struct xfs_bmap_intent {
struct xfs_bmbt_irec bi_bmap;
};
-void xfs_bmap_update_get_group(struct xfs_mount *mp,
- struct xfs_bmap_intent *bi);
-
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
+ int whichfork, struct xfs_bmbt_irec *imap);
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
+ int whichfork, struct xfs_bmbt_irec *imap);
static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
@@ -280,4 +281,12 @@ extern struct kmem_cache *xfs_bmap_intent_cache;
int __init xfs_bmap_intent_init_cache(void);
void xfs_bmap_intent_destroy_cache(void);
+typedef int (*xfs_bmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv);
+
+int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
+ void *priv);
+
#endif /* __XFS_BMAP_H__ */
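With the fork now an explicit argument, deferred map/unmap intents can target the attr fork as well as the data fork. A minimal caller sketch (the wrapper is illustrative); holes and delalloc extents need no special casing, since __xfs_bmap_add filters them out:

    /* Sketch: queue a deferred mapping into the attr fork. */
    static void defer_attr_map(struct xfs_trans *tp, struct xfs_inode *ip,
                    struct xfs_bmbt_irec *irec)
    {
            /* No-op for holes/delalloc; filtered inside __xfs_bmap_add. */
            xfs_bmap_map_extent(tp, ip, XFS_ATTR_FORK, irec);
    }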
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 71f2d50f7823..f5d84dcb58da 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -26,6 +26,22 @@
static struct kmem_cache *xfs_bmbt_cur_cache;
+void
+xfs_bmbt_init_block(
+ struct xfs_inode *ip,
+ struct xfs_btree_block *buf,
+ struct xfs_buf *bp,
+ __u16 level,
+ __u16 numrecs)
+{
+ if (bp)
+ xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level,
+ numrecs, ip->i_ino);
+ else
+ xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level,
+ numrecs, ip->i_ino);
+}
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -44,9 +60,7 @@ xfs_bmdr_to_bmbt(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
- XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, rblock, NULL, 0, 0);
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
@@ -171,13 +185,8 @@ xfs_bmbt_dup_cursor(
new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
cur->bc_ino.ip, cur->bc_ino.whichfork);
-
- /*
- * Copy the firstblock, dfops, and flags values,
- * since init cursor doesn't get them.
- */
- new->bc_ino.flags = cur->bc_ino.flags;
-
+ new->bc_flags |= (cur->bc_flags &
+ (XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL));
return new;
}
@@ -189,10 +198,10 @@ xfs_bmbt_update_cursor(
ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) ||
(dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
- dst->bc_ino.allocated += src->bc_ino.allocated;
+ dst->bc_bmap.allocated += src->bc_bmap.allocated;
dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno;
- src->bc_ino.allocated = 0;
+ src->bc_bmap.allocated = 0;
}
STATIC int
@@ -211,7 +220,7 @@ xfs_bmbt_alloc_block(
xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
cur->bc_ino.whichfork);
args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
+ args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL;
if (!args.wasdel && args.tp->t_blk_res == 0)
return -ENOSPC;
@@ -247,7 +256,7 @@ xfs_bmbt_alloc_block(
}
ASSERT(args.len == 1);
- cur->bc_ino.allocated++;
+ cur->bc_bmap.allocated++;
cur->bc_ino.ip->i_nblocks++;
xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
@@ -360,14 +369,6 @@ xfs_bmbt_init_rec_from_cur(
xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
}
-STATIC void
-xfs_bmbt_init_ptr_from_cur(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- ptr->l = 0;
-}
-
STATIC int64_t
xfs_bmbt_key_diff(
struct xfs_btree_cur *cur,
@@ -419,7 +420,7 @@ xfs_bmbt_verify(
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
*/
- fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
}
@@ -435,7 +436,7 @@ xfs_bmbt_verify(
if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
return __this_address;
- return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
+ return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
}
static void
@@ -444,7 +445,7 @@ xfs_bmbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_lblock_verify_crc(bp))
+ if (!xfs_btree_fsblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_bmbt_verify(bp);
@@ -468,7 +469,7 @@ xfs_bmbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_lblock_calc_crc(bp);
+ xfs_btree_fsblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -515,9 +516,16 @@ xfs_bmbt_keys_contiguous(
be64_to_cpu(key2->bmbt.br_startoff));
}
-static const struct xfs_btree_ops xfs_bmbt_ops = {
+const struct xfs_btree_ops xfs_bmbt_ops = {
+ .name = "bmap",
+ .type = XFS_BTREE_TYPE_INODE,
+
.rec_len = sizeof(xfs_bmbt_rec_t),
.key_len = sizeof(xfs_bmbt_key_t),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_BMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2),
.dup_cursor = xfs_bmbt_dup_cursor,
.update_cursor = xfs_bmbt_update_cursor,
@@ -529,7 +537,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
.init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
- .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
.diff_two_keys = xfs_bmbt_diff_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
@@ -538,35 +545,10 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.keys_contiguous = xfs_bmbt_keys_contiguous,
};
-static struct xfs_btree_cur *
-xfs_bmbt_init_common(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_btree_cur *cur;
-
- ASSERT(whichfork != XFS_COW_FORK);
-
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
- mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
-
- cur->bc_ops = &xfs_bmbt_ops;
- cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
- cur->bc_ino.ip = ip;
- cur->bc_ino.allocated = 0;
- cur->bc_ino.flags = 0;
-
- return cur;
-}
-
/*
- * Allocate a new bmap btree cursor.
+ * Create a new bmap btree cursor.
+ *
+ * For staging cursors, -1 is passed in as whichfork.
*/
struct xfs_btree_cur *
xfs_bmbt_init_cursor(
@@ -575,15 +557,34 @@ xfs_bmbt_init_cursor(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
+ unsigned int maxlevels;
- cur = xfs_bmbt_init_common(mp, tp, ip, whichfork);
+ ASSERT(whichfork != XFS_COW_FORK);
- cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ /*
+ * The data fork always has the larger maxlevels, so use that for
+ * staging cursors.
+ */
+ switch (whichfork) {
+ case XFS_STAGING_FORK:
+ maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK];
+ break;
+ default:
+ maxlevels = mp->m_bm_maxlevels[whichfork];
+ break;
+ }
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels,
+ xfs_bmbt_cur_cache);
+ cur->bc_ino.ip = ip;
cur->bc_ino.whichfork = whichfork;
+ cur->bc_bmap.allocated = 0;
+ if (whichfork != XFS_STAGING_FORK) {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ }
return cur;
}
@@ -599,33 +600,6 @@ xfs_bmbt_block_maxrecs(
}
/*
- * Allocate a new bmap btree cursor for reloading an inode block mapping data
- * structure. Note that callers can use the staged cursor to reload extents
- * format inode forks if they rebuild the iext tree and commit the staged
- * cursor immediately.
- */
-struct xfs_btree_cur *
-xfs_bmbt_stage_cursor(
- struct xfs_mount *mp,
- struct xfs_inode *ip,
- struct xbtree_ifakeroot *ifake)
-{
- struct xfs_btree_cur *cur;
- struct xfs_btree_ops *ops;
-
- /* data fork always has larger maxheight */
- cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK);
- cur->bc_nlevels = ifake->if_levels;
- cur->bc_ino.forksize = ifake->if_fork_size;
-
- /* Don't let anyone think we're attached to the real fork yet. */
- cur->bc_ino.whichfork = -1;
- xfs_btree_stage_ifakeroot(cur, ifake, &ops);
- ops->update_cursor = NULL;
- return cur;
-}
-
-/*
* Swap in the new inode fork root. Once we pass this point the newly rebuilt
* mappings are in place and we have to kill off any old btree blocks.
*/
@@ -665,7 +639,7 @@ xfs_bmbt_commit_staged_btree(
break;
}
xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
- xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops);
+ xfs_btree_commit_ifakeroot(cur, tp, whichfork);
}
/*
@@ -751,7 +725,7 @@ xfs_bmbt_change_owner(
ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
- cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
+ cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error);
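xfs_bmbt_init_block(), added at the top of this file, folds the two xfs_btree_init_block_int() call shapes into one helper keyed off the buffer pointer. A sketch mirroring the converted callers above (the wrapper function is illustrative):

    /* Sketch of the two call shapes used by this patch. */
    static void bmbt_init_blocks_example(struct xfs_inode *ip,
                    struct xfs_ifork *ifp, struct xfs_buf *abp)
    {
            /* NULL bp: the incore root block, no disk address stamped. */
            xfs_bmbt_init_block(ip, ifp->if_broot, NULL, 1, 1);

            /* Real bp: an on-disk block, stamped with the buffer's daddr. */
            xfs_bmbt_init_block(ip, XFS_BUF_TO_BLOCK(abp), abp, 0, 0);
    }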
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 151b8491f60e..de1b73f1225c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -107,8 +107,6 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
-struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp,
- struct xfs_inode *ip, struct xbtree_ifakeroot *ifake);
void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, int whichfork);
@@ -120,4 +118,7 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void);
int __init xfs_bmbt_init_cur_cache(void);
void xfs_bmbt_destroy_cur_cache(void);
+void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf,
+ struct xfs_buf *bp, __u16 level, __u16 numrecs);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ea8d3659df20..d29547572a68 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -27,28 +27,24 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
/*
* Btree magic numbers.
*/
-static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
- XFS_FIBT_MAGIC, 0 },
- { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
- XFS_REFC_CRC_MAGIC }
-};
-
uint32_t
xfs_btree_magic(
- int crc,
- xfs_btnum_t btnum)
+ struct xfs_mount *mp,
+ const struct xfs_btree_ops *ops)
{
- uint32_t magic = xfs_magics[crc][btnum];
+ int idx = xfs_has_crc(mp) ? 1 : 0;
+ __be32 magic = ops->buf_ops->magic[idx];
/* Ensure we asked for crc for crc-only magics. */
ASSERT(magic != 0);
- return magic;
+ return be32_to_cpu(magic);
}
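
To make the table removal concrete, a hedged illustration of where the
magics now live: each set of buffer ops carries a two-entry magic array,
index 0 for v4 (non-CRC) and index 1 for v5 (CRC) filesystems. The
initializer below mirrors how existing verifiers are declared, but the
example_ name itself is hypothetical:

	static const struct xfs_buf_ops example_bnobt_buf_ops = {
		/* magic[0] for !crc filesystems, magic[1] for crc */
		.magic = { cpu_to_be32(XFS_ABTB_MAGIC),
			   cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
	};

xfs_btree_magic() then simply picks the slot matching xfs_has_crc() and
asserts that a CRC-only btree is never asked for a v4 magic.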
/*
@@ -63,10 +59,8 @@ xfs_btree_magic(
* bytes.
*/
static inline xfs_failaddr_t
-xfs_btree_check_lblock_siblings(
+xfs_btree_check_fsblock_siblings(
struct xfs_mount *mp,
- struct xfs_btree_cur *cur,
- int level,
xfs_fsblock_t fsb,
__be64 dsibling)
{
@@ -78,22 +72,33 @@ xfs_btree_check_lblock_siblings(
sibling = be64_to_cpu(dsibling);
if (sibling == fsb)
return __this_address;
- if (level >= 0) {
- if (!xfs_btree_check_lptr(cur, sibling, level + 1))
- return __this_address;
- } else {
- if (!xfs_verify_fsbno(mp, sibling))
- return __this_address;
- }
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ return NULL;
+}
+static inline xfs_failaddr_t
+xfs_btree_check_memblock_siblings(
+ struct xfs_buftarg *btp,
+ xfbno_t bno,
+ __be64 dsibling)
+{
+ xfbno_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == bno)
+ return __this_address;
+ if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
+ return __this_address;
return NULL;
}
static inline xfs_failaddr_t
-xfs_btree_check_sblock_siblings(
+xfs_btree_check_agblock_siblings(
struct xfs_perag *pag,
- struct xfs_btree_cur *cur,
- int level,
xfs_agblock_t agbno,
__be32 dsibling)
{
@@ -105,34 +110,21 @@ xfs_btree_check_sblock_siblings(
sibling = be32_to_cpu(dsibling);
if (sibling == agbno)
return __this_address;
- if (level >= 0) {
- if (!xfs_btree_check_sptr(cur, sibling, level + 1))
- return __this_address;
- } else {
- if (!xfs_verify_agbno(pag, sibling))
- return __this_address;
- }
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
return NULL;
}
-/*
- * Check a long btree block header. Return the address of the failing check,
- * or NULL if everything is ok.
- */
-xfs_failaddr_t
-__xfs_btree_check_lblock(
+static xfs_failaddr_t
+__xfs_btree_check_lblock_hdr(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_has_crc(mp);
- xfs_failaddr_t fa;
- xfs_fsblock_t fsb = NULLFSBLOCK;
- if (crc) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.l.bb_blkno !=
@@ -142,7 +134,7 @@ __xfs_btree_check_lblock(
return __this_address;
}
- if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
return __this_address;
if (be16_to_cpu(block->bb_level) != level)
return __this_address;
@@ -150,44 +142,83 @@ __xfs_btree_check_lblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp)
- fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ return NULL;
+}
+
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_fsblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb;
+
+ fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+ if (fa)
+ return fa;
- fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ /*
+ * For inode-rooted btrees, the root block sits in the inode fork. In
+ * that case bp is NULL, and the block must not have any siblings.
+ */
+ if (!bp) {
+ if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
+ return __this_address;
+ if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
+ return __this_address;
+ return NULL;
+ }
+
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_rightsib);
return fa;
}
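
A hedged illustration of the bp == NULL case above: the bmbt root that
lives in the inode fork is validated with no buffer attached, which is
exactly what engages the no-siblings rule. The call shape mirrors what the
generic block check does internally and introduces no new API:

	/* check the inode-fork root; it sits at the highest level */
	fa = __xfs_btree_check_block(cur, xfs_btree_get_iroot(cur),
			cur->bc_nlevels - 1, NULL);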
-/* Check a long btree block header. */
-static int
-xfs_btree_check_lblock(
+/*
+ * Check an in-memory btree block header. Return the address of the failing
+ * check, or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_memblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
xfs_failaddr_t fa;
+ xfbno_t bno;
- fa = __xfs_btree_check_lblock(cur, block, level, bp);
- if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
- if (bp)
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- return -EFSCORRUPTED;
- }
- return 0;
+ fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+ if (fa)
+ return fa;
+
+ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/*
* Check a short btree block header. Return the address of the failing check,
* or NULL if everything is ok.
*/
-xfs_failaddr_t
-__xfs_btree_check_sblock(
+static xfs_failaddr_t
+__xfs_btree_check_agblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
@@ -195,20 +226,17 @@ __xfs_btree_check_sblock(
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_perag *pag = cur->bc_ag.pag;
- xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_has_crc(mp);
xfs_failaddr_t fa;
- xfs_agblock_t agbno = NULLAGBLOCK;
+ xfs_agblock_t agbno;
- if (crc) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.s.bb_blkno !=
- cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
}
- if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
return __this_address;
if (be16_to_cpu(block->bb_level) != level)
return __this_address;
@@ -216,36 +244,45 @@ __xfs_btree_check_sblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp)
- agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-
- fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_agblock_siblings(pag, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ fa = xfs_btree_check_agblock_siblings(pag, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
-/* Check a short btree block header. */
-STATIC int
-xfs_btree_check_sblock(
+/*
+ * Internal btree block check.
+ *
+ * Return NULL if the block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t
+__xfs_btree_check_block(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = cur->bc_mp;
- xfs_failaddr_t fa;
-
- fa = __xfs_btree_check_sblock(cur, block, level, bp);
- if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
- if (bp)
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- return -EFSCORRUPTED;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ return __xfs_btree_check_memblock(cur, block, level, bp);
+ case XFS_BTREE_TYPE_AG:
+ return __xfs_btree_check_agblock(cur, block, level, bp);
+ case XFS_BTREE_TYPE_INODE:
+ return __xfs_btree_check_fsblock(cur, block, level, bp);
+ default:
+ ASSERT(0);
+ return __this_address;
}
- return 0;
+}
+
+static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
+ return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
+ return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}
/*
@@ -258,34 +295,49 @@ xfs_btree_check_block(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer containing block, if any */
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return xfs_btree_check_lblock(cur, block, level, bp);
- else
- return xfs_btree_check_sblock(cur, block, level, bp);
-}
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
-/* Check that this long pointer is valid and points within the fs. */
-bool
-xfs_btree_check_lptr(
- struct xfs_btree_cur *cur,
- xfs_fsblock_t fsbno,
- int level)
-{
- if (level <= 0)
- return false;
- return xfs_verify_fsbno(cur->bc_mp, fsbno);
+ fa = __xfs_btree_check_block(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
-/* Check that this short pointer is valid and points within the AG. */
-bool
-xfs_btree_check_sptr(
- struct xfs_btree_cur *cur,
- xfs_agblock_t agbno,
- int level)
+int
+__xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (level <= 0)
- return false;
- return xfs_verify_agbno(cur->bc_ag.pag, agbno);
+ return -EFSCORRUPTED;
+
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
+ be64_to_cpu((&ptr->l)[index])))
+ return -EFSCORRUPTED;
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ if (!xfs_verify_fsbno(cur->bc_mp,
+ be64_to_cpu((&ptr->l)[index])))
+ return -EFSCORRUPTED;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ if (!xfs_verify_agbno(cur->bc_ag.pag,
+ be32_to_cpu((&ptr->s)[index])))
+ return -EFSCORRUPTED;
+ break;
+ }
+
+ return 0;
}
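
One clarifying note on the pointer arithmetic used in all three branches
above, derived directly from the code:

	/*
	 * ptr points at a packed array of big-endian pointers inside the
	 * btree block, so (&ptr->l)[index] (or (&ptr->s)[index] for short
	 * pointers) selects the index'th array entry rather than a member
	 * of a single union.
	 */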
/*
@@ -299,26 +351,35 @@ xfs_btree_check_ptr(
int index,
int level)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
- level))
- return 0;
- xfs_err(cur->bc_mp,
-"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
+ int error;
+
+ error = __xfs_btree_check_ptr(cur, ptr, index, level);
+ if (error) {
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ xfs_err(cur->bc_mp,
+"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
+ cur->bc_ops->name, cur->bc_flags, level, index,
+ __this_address);
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ xfs_err(cur->bc_mp,
+"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
cur->bc_ino.ip->i_ino,
- cur->bc_ino.whichfork, cur->bc_btnum,
+ cur->bc_ino.whichfork, cur->bc_ops->name,
level, index);
- } else {
- if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
- level))
- return 0;
- xfs_err(cur->bc_mp,
-"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_ag.pag->pag_agno, cur->bc_btnum,
+ break;
+ case XFS_BTREE_TYPE_AG:
+ xfs_err(cur->bc_mp,
+"AG %u: Corrupt %sbt pointer at level %d index %d.",
+ cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
level, index);
+ break;
+ }
+ xfs_btree_mark_sick(cur);
}
- return -EFSCORRUPTED;
+ return error;
}
#ifdef DEBUG
@@ -336,7 +397,7 @@ xfs_btree_check_ptr(
* it to disk.
*/
void
-xfs_btree_lblock_calc_crc(
+xfs_btree_fsblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -350,7 +411,7 @@ xfs_btree_lblock_calc_crc(
}
bool
-xfs_btree_lblock_verify_crc(
+xfs_btree_fsblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -374,7 +435,7 @@ xfs_btree_lblock_verify_crc(
* it to disk.
*/
void
-xfs_btree_sblock_calc_crc(
+xfs_btree_agblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -388,7 +449,7 @@ xfs_btree_sblock_calc_crc(
}
bool
-xfs_btree_sblock_verify_crc(
+xfs_btree_agblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -410,6 +471,17 @@ xfs_btree_free_block(
{
int error;
+ trace_xfs_btree_free_block(cur, bp);
+
+ /*
+ * Don't allow block freeing for a staging cursor, because staging
+ * cursors do not support regular btree modifications.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
error = cur->bc_ops->free_block(cur, bp);
if (!error) {
xfs_trans_binval(cur->bc_tp, bp);
@@ -448,33 +520,70 @@ xfs_btree_del_cursor(
* zero, then we should be shut down or on our way to shutdown due to
* cancelling a dirty transaction on error.
*/
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
xfs_is_shutdown(cur->bc_mp) || error != 0);
- if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
- kmem_free(cur->bc_ops);
- if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
- xfs_perag_put(cur->bc_ag.pag);
+
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
+ if (cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ /* nothing to do */
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ if (cur->bc_mem.pag)
+ xfs_perag_put(cur->bc_mem.pag);
+ break;
+ }
+
kmem_cache_free(cur->bc_cache, cur);
}
+/* Return the buffer target for this btree's buffers. */
+static inline struct xfs_buftarg *
+xfs_btree_buftarg(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+ return cur->bc_mem.xfbtree->target;
+ return cur->bc_mp->m_ddev_targp;
+}
+
+/* Return the block size (in units of 512b sectors) for this btree. */
+static inline unsigned int
+xfs_btree_bbsize(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+ return XFBNO_BBSIZE;
+ return cur->bc_mp->m_bsize;
+}
+
/*
* Duplicate the btree cursor.
* Allocate a new one, copy the record, re-get the buffers.
*/
-int /* error */
+int /* error */
xfs_btree_dup_cursor(
- struct xfs_btree_cur *cur, /* input cursor */
- struct xfs_btree_cur **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
- struct xfs_buf *bp; /* btree block's buffer pointer */
- int error; /* error return value */
- int i; /* level number of btree block */
- xfs_mount_t *mp; /* mount structure for filesystem */
- struct xfs_btree_cur *new; /* new cursor value */
- xfs_trans_t *tp; /* transaction pointer, can be NULL */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_trans *tp = cur->bc_tp;
+ struct xfs_buf *bp;
+ struct xfs_btree_cur *new;
+ int error;
+ int i;
- tp = cur->bc_tp;
- mp = cur->bc_mp;
+ /*
+ * Don't allow staging cursors to be duplicated because they're supposed
+ * to be kept private to a single thread.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
/*
* Allocate a new cursor like the old one.
@@ -494,10 +603,13 @@ xfs_btree_dup_cursor(
new->bc_levels[i].ra = cur->bc_levels[i].ra;
bp = cur->bc_levels[i].bp;
if (bp) {
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- xfs_buf_daddr(bp), mp->m_bsize,
- 0, &bp,
- cur->bc_ops->buf_ops);
+ error = xfs_trans_read_buf(mp, tp,
+ xfs_btree_buftarg(cur),
+ xfs_buf_daddr(bp),
+ xfs_btree_bbsize(cur), 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(new);
if (error) {
xfs_btree_del_cursor(new, error);
*ncur = NULL;
@@ -539,7 +651,7 @@ xfs_btree_dup_cursor(
* record, key or pointer (xfs_btree_*_addr). Note that all addressing
* inside the btree block is done using indices starting at one, not zero!
*
- * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
* overlapping intervals. In such a tree, records are still sorted lowest to
* highest and indexed by the smallest key value that refers to the record.
* However, nodes are different: each pointer has two associated keys -- one
@@ -589,26 +701,17 @@ xfs_btree_dup_cursor(
*/
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ if (xfs_has_crc(cur->bc_mp))
return XFS_BTREE_LBLOCK_CRC_LEN;
return XFS_BTREE_LBLOCK_LEN;
}
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ if (xfs_has_crc(cur->bc_mp))
return XFS_BTREE_SBLOCK_CRC_LEN;
return XFS_BTREE_SBLOCK_LEN;
}
/*
- * Return size of btree block pointers for this btree instance.
- */
-static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
-{
- return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
- sizeof(__be64) : sizeof(__be32);
-}
-
-/*
* Calculate offset of the n-th record in a btree block.
*/
STATIC size_t
@@ -655,7 +758,7 @@ xfs_btree_ptr_offset(
{
return xfs_btree_block_len(cur) +
cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
- (n - 1) * xfs_btree_ptr_len(cur);
+ (n - 1) * cur->bc_ops->ptr_len;
}
/*
@@ -718,7 +821,7 @@ struct xfs_ifork *
xfs_btree_ifork_ptr(
struct xfs_btree_cur *cur)
{
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
if (cur->bc_flags & XFS_BTREE_STAGING)
return cur->bc_ino.ifake->if_fork;
@@ -750,8 +853,7 @@ xfs_btree_get_block(
int level, /* level in btree */
struct xfs_buf **bpp) /* buffer containing the block */
{
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1)) {
+ if (xfs_btree_at_iroot(cur, level)) {
*bpp = NULL;
return xfs_btree_get_iroot(cur);
}
@@ -856,95 +958,52 @@ xfs_btree_offsets(
}
}
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int
-xfs_btree_read_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- struct xfs_buf **bpp, /* buffer for fsbno */
- int refval, /* ref count value for buffer */
- const struct xfs_buf_ops *ops)
-{
- struct xfs_buf *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- if (!xfs_verify_fsbno(mp, fsbno))
- return -EFSCORRUPTED;
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0, &bp, ops);
- if (error)
- return error;
- if (bp)
- xfs_buf_set_ref(bp, refval);
- *bpp = bp;
- return 0;
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufl(
- struct xfs_mount *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops)
+STATIC int
+xfs_btree_readahead_fsblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
{
- xfs_daddr_t d;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ int rval = 0;
- ASSERT(fsbno != NULLFSBLOCK);
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+ xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
+ mp->m_bsize, cur->bc_ops->buf_ops);
+ rval++;
+ }
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufs(
- struct xfs_mount *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops)
-{
- xfs_daddr_t d;
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+ xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
+ mp->m_bsize, cur->bc_ops->buf_ops);
+ rval++;
+ }
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+ return rval;
}
STATIC int
-xfs_btree_readahead_lblock(
+xfs_btree_readahead_memblock(
struct xfs_btree_cur *cur,
int lr,
struct xfs_btree_block *block)
{
+ struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
+ xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
int rval = 0;
- xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
- xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
- xfs_btree_reada_bufl(cur->bc_mp, left, 1,
- cur->bc_ops->buf_ops);
+ xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
- xfs_btree_reada_bufl(cur->bc_mp, right, 1,
- cur->bc_ops->buf_ops);
+ xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -952,25 +1011,28 @@ xfs_btree_readahead_lblock(
}
STATIC int
-xfs_btree_readahead_sblock(
+xfs_btree_readahead_agblock(
struct xfs_btree_cur *cur,
int lr,
- struct xfs_btree_block *block)
+ struct xfs_btree_block *block)
{
- int rval = 0;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
-
+ int rval = 0;
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- left, 1, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, left),
+ mp->m_bsize, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- right, 1, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, right),
+ mp->m_bsize, cur->bc_ops->buf_ops);
rval++;
}
@@ -993,8 +1055,7 @@ xfs_btree_readahead(
* No readahead needed if we are at the root level and the
* btree root is stored in the inode.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (lev == cur->bc_nlevels - 1))
+ if (xfs_btree_at_iroot(cur, lev))
return 0;
if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
@@ -1003,9 +1064,17 @@ xfs_btree_readahead(
cur->bc_levels[lev].ra |= lr;
block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return xfs_btree_readahead_lblock(cur, lr, block);
- return xfs_btree_readahead_sblock(cur, lr, block);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
+ return xfs_btree_readahead_agblock(cur, lr, block);
+ case XFS_BTREE_TYPE_INODE:
+ return xfs_btree_readahead_fsblock(cur, lr, block);
+ case XFS_BTREE_TYPE_MEM:
+ return xfs_btree_readahead_memblock(cur, lr, block);
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
STATIC int
@@ -1014,23 +1083,24 @@ xfs_btree_ptr_to_daddr(
const union xfs_btree_ptr *ptr,
xfs_daddr_t *daddr)
{
- xfs_fsblock_t fsbno;
- xfs_agblock_t agbno;
int error;
error = xfs_btree_check_ptr(cur, ptr, 0, 1);
if (error)
return error;
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- fsbno = be64_to_cpu(ptr->l);
- *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
- } else {
- agbno = be32_to_cpu(ptr->s);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno);
+ be32_to_cpu(ptr->s));
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
+ break;
}
-
return 0;
}
@@ -1050,8 +1120,9 @@ xfs_btree_readahead_ptr(
if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
return;
- xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
- cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
+ xfs_btree_bbsize(cur) * count,
+ cur->bc_ops->buf_ops);
}
/*
@@ -1072,7 +1143,7 @@ xfs_btree_setbuf(
cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
@@ -1090,7 +1161,7 @@ xfs_btree_ptr_is_null(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return ptr->l == cpu_to_be64(NULLFSBLOCK);
else
return ptr->s == cpu_to_be32(NULLAGBLOCK);
@@ -1101,12 +1172,23 @@ xfs_btree_set_ptr_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
ptr->l = cpu_to_be64(NULLFSBLOCK);
else
ptr->s = cpu_to_be32(NULLAGBLOCK);
}
+static inline bool
+xfs_btree_ptrs_equal(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr1,
+ union xfs_btree_ptr *ptr2)
+{
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+ return ptr1->l == ptr2->l;
+ return ptr1->s == ptr2->s;
+}
+
/*
* Get/set/init sibling pointers
*/
@@ -1119,7 +1201,7 @@ xfs_btree_get_sibling(
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (lr == XFS_BB_RIGHTSIB)
ptr->l = block->bb_u.l.bb_rightsib;
else
@@ -1141,7 +1223,7 @@ xfs_btree_set_sibling(
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (lr == XFS_BB_RIGHTSIB)
block->bb_u.l.bb_rightsib = ptr->l;
else
@@ -1154,25 +1236,24 @@ xfs_btree_set_sibling(
}
}
-void
-xfs_btree_init_block_int(
+static void
+__xfs_btree_init_block(
struct xfs_mount *mp,
struct xfs_btree_block *buf,
+ const struct xfs_btree_ops *ops,
xfs_daddr_t blkno,
- xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags)
+ __u64 owner)
{
- int crc = xfs_has_crc(mp);
- __u32 magic = xfs_btree_magic(crc, btnum);
+ bool crc = xfs_has_crc(mp);
+ __u32 magic = xfs_btree_magic(mp, ops);
buf->bb_magic = cpu_to_be32(magic);
buf->bb_level = cpu_to_be16(level);
buf->bb_numrecs = cpu_to_be16(numrecs);
- if (flags & XFS_BTREE_LONG_PTRS) {
+ if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
if (crc) {
@@ -1183,14 +1264,12 @@ xfs_btree_init_block_int(
buf->bb_u.l.bb_lsn = 0;
}
} else {
- /* owner is a 32 bit value on short blocks */
- __u32 __owner = (__u32)owner;
-
buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
if (crc) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
- buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ /* owner is a 32 bit value on short blocks */
+ buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.s.bb_lsn = 0;
}
@@ -1199,15 +1278,46 @@ xfs_btree_init_block_int(
void
xfs_btree_init_block(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner)
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ const struct xfs_btree_ops *ops,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
+ numrecs, owner);
+}
+
+void
+xfs_btree_init_buf(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ const struct xfs_btree_ops *ops,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
+ xfs_buf_daddr(bp), level, numrecs, owner);
+ bp->b_ops = ops->buf_ops;
+}
+
+static inline __u64
+xfs_btree_owner(
+ struct xfs_btree_cur *cur)
{
- xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
- btnum, level, numrecs, owner, 0);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ return cur->bc_mem.xfbtree->owner;
+ case XFS_BTREE_TYPE_INODE:
+ return cur->bc_ino.ip->i_ino;
+ case XFS_BTREE_TYPE_AG:
+ return cur->bc_ag.pag->pag_agno;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
void
@@ -1217,22 +1327,8 @@ xfs_btree_init_block_cur(
int level,
int numrecs)
{
- __u64 owner;
-
- /*
- * we can pull the owner from the cursor right now as the different
- * owners align directly with the pointer size of the btree. This may
- * change in future, but is safe for current users of the generic btree
- * code.
- */
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- owner = cur->bc_ino.ip->i_ino;
- else
- owner = cur->bc_ag.pag->pag_agno;
-
- xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
- xfs_buf_daddr(bp), cur->bc_btnum, level,
- numrecs, owner, cur->bc_flags);
+ xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
+ xfs_btree_owner(cur));
}
/*
@@ -1250,7 +1346,7 @@ xfs_btree_is_lastrec(
if (level > 0)
return 0;
- if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE))
return 0;
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
@@ -1265,41 +1361,27 @@ xfs_btree_buf_to_ptr(
struct xfs_buf *bp,
union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(bp)));
- else {
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp)));
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
+ break;
}
}
-STATIC void
+static inline void
xfs_btree_set_refs(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
- break;
- case XFS_BTNUM_INO:
- case XFS_BTNUM_FINO:
- xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
- break;
- case XFS_BTNUM_BMAP:
- xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
- break;
- case XFS_BTNUM_RMAP:
- xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
- break;
- case XFS_BTNUM_REFC:
- xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
- break;
- default:
- ASSERT(0);
- }
+ xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}
int
@@ -1309,15 +1391,14 @@ xfs_btree_get_buf_block(
struct xfs_btree_block **block,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = cur->bc_mp;
- xfs_daddr_t d;
- int error;
+ xfs_daddr_t d;
+ int error;
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
- 0, bpp);
+ error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
+ xfs_btree_bbsize(cur), 0, bpp);
if (error)
return error;
@@ -1348,9 +1429,11 @@ xfs_btree_read_buf_block(
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags, bpp,
- cur->bc_ops->buf_ops);
+ error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
+ xfs_btree_bbsize(cur), flags, bpp,
+ cur->bc_ops->buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
return error;
@@ -1398,7 +1481,7 @@ xfs_btree_copy_ptrs(
int numptrs)
{
ASSERT(numptrs >= 0);
- memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+ memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}
/*
@@ -1454,8 +1537,8 @@ xfs_btree_shift_ptrs(
ASSERT(numptrs >= 0);
ASSERT(dir == 1 || dir == -1);
- dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
- memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+ dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
+ memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}
/*
@@ -1566,7 +1649,7 @@ xfs_btree_log_block(
if (bp) {
int nbits;
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (xfs_has_crc(cur->bc_mp)) {
/*
* We don't log the CRC when updating a btree
* block but instead recreate it during log
@@ -1581,7 +1664,7 @@ xfs_btree_log_block(
nbits = XFS_BB_NUM_BITS;
}
xfs_btree_offsets(fields,
- (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
loffsets : soffsets,
nbits, &first, &last);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
@@ -1658,9 +1741,10 @@ xfs_btree_increment(
* confused or have the tree root in an inode.
*/
if (lev == cur->bc_nlevels) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
goto out0;
ASSERT(0);
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1751,9 +1835,10 @@ xfs_btree_decrement(
* or the root of the tree is in an inode.
*/
if (lev == cur->bc_nlevels) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
goto out0;
ASSERT(0);
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1786,6 +1871,33 @@ error0:
return error;
}
+/*
+ * Check the btree block owner now that we have the context to know who the
+ * real owner is.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_block_owner(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block)
+{
+ __u64 owner;
+
+ if (!xfs_has_crc(cur->bc_mp) ||
+ (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
+ return NULL;
+
+ owner = xfs_btree_owner(cur);
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
+ return __this_address;
+ } else {
+ if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_btree_lookup_get_block(
struct xfs_btree_cur *cur, /* btree cursor */
@@ -1798,8 +1910,7 @@ xfs_btree_lookup_get_block(
int error = 0;
/* special case the root block if in an inode */
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1)) {
+ if (xfs_btree_at_iroot(cur, level)) {
*blkp = xfs_btree_get_iroot(cur);
return 0;
}
@@ -1824,11 +1935,7 @@ xfs_btree_lookup_get_block(
return error;
/* Check the inode owner since the verifiers don't. */
- if (xfs_has_crc(cur->bc_mp) &&
- !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
- (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
- be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
- cur->bc_ino.ip->i_ino)
+ if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
goto out_bad;
/* Did we get the level we were looking for? */
@@ -1846,6 +1953,7 @@ out_bad:
*blkp = NULL;
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -1872,6 +1980,27 @@ xfs_lookup_get_search_key(
}
/*
+ * Initialize a pointer to the root block.
+ */
+void
+xfs_btree_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+ /*
+ * Inode-rooted btrees call xfs_btree_get_iroot to find the root
+ * in xfs_btree_lookup_get_block and don't need a pointer here.
+ */
+ ptr->l = 0;
+ } else if (cur->bc_flags & XFS_BTREE_STAGING) {
+ ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
+ } else {
+ cur->bc_ops->init_ptr_from_cur(cur, ptr);
+ }
+}
+
+/*
* Lookup the record. The cursor is made to point to it, based on dir.
* stat is set to 0 if can't find any such record, 1 for success.
*/
@@ -1892,14 +2021,16 @@ xfs_btree_lookup(
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
- if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
block = NULL;
keyno = 0;
/* initialise start pointer from cursor */
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
pp = &ptr;
/*
@@ -1936,6 +2067,7 @@ xfs_btree_lookup(
XFS_ERRLEVEL_LOW,
cur->bc_mp, block,
sizeof(*block));
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -2012,8 +2144,10 @@ xfs_btree_lookup(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
*stat = 1;
return 0;
}
@@ -2040,7 +2174,7 @@ xfs_btree_high_key_from_key(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
- ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
return (union xfs_btree_key *)((char *)key +
(cur->bc_ops->key_len / 2));
}
@@ -2061,7 +2195,7 @@ xfs_btree_get_leaf_keys(
rec = xfs_btree_rec_addr(cur, 1, block);
cur->bc_ops->init_key_from_rec(key, rec);
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
@@ -2088,7 +2222,7 @@ xfs_btree_get_node_keys(
union xfs_btree_key *high;
int n;
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
memcpy(key, xfs_btree_key_addr(cur, 1, block),
cur->bc_ops->key_len / 2);
@@ -2132,7 +2266,7 @@ xfs_btree_needs_key_update(
struct xfs_btree_cur *cur,
int ptr)
{
- return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+ return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
}
/*
@@ -2156,7 +2290,7 @@ __xfs_btree_updkeys(
struct xfs_buf *bp;
int ptr;
- ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
/* Exit if there aren't any parent levels to update. */
if (level + 1 >= cur->bc_nlevels)
@@ -2225,7 +2359,7 @@ xfs_btree_update_keys(
ASSERT(level >= 0);
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
return __xfs_btree_updkeys(cur, level, block, bp, false);
/*
@@ -2332,8 +2466,7 @@ xfs_btree_lshift(
int error; /* error return value */
int i;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1)
+ if (xfs_btree_at_iroot(cur, level))
goto out0;
/* Set up variables for this block as "right". */
@@ -2460,12 +2593,13 @@ xfs_btree_lshift(
* Using a temporary cursor, update the parent key values of the
* block on the left.
*/
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_dup_cursor(cur, &tcur);
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2527,8 +2661,7 @@ xfs_btree_rshift(
int error; /* error return value */
int i; /* loop counter */
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1))
+ if (xfs_btree_at_iroot(cur, level))
goto out0;
/* Set up variables for this block as "left". */
@@ -2636,6 +2769,7 @@ xfs_btree_rshift(
goto error0;
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2645,7 +2779,7 @@ xfs_btree_rshift(
goto error1;
/* Update the parent high keys of the left block, if needed. */
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_update_keys(cur, level);
if (error)
goto error1;
@@ -2673,6 +2807,32 @@ error1:
return error;
}
+static inline int
+xfs_btree_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *hint_block,
+ union xfs_btree_ptr *new_block,
+ int *stat)
+{
+ int error;
+
+ /*
+ * Don't allow block allocation for a staging cursor, because staging
+ * cursors do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
+ trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
+ return error;
+}
+
/*
* Split cur/level block in half.
* Return new block number and the key to its first
@@ -2716,7 +2876,7 @@ __xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+ error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2823,7 +2983,7 @@ __xfs_btree_split(
}
/* Update the parent high keys of the left block, if needed. */
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
@@ -2941,7 +3101,7 @@ xfs_btree_split(
struct xfs_btree_split_args args;
DECLARE_COMPLETION_ONSTACK(done);
- if (cur->bc_btnum != XFS_BTNUM_BMAP ||
+ if (!xfs_btree_is_bmap(cur->bc_ops) ||
cur->bc_tp->t_highest_agno == NULLAGNUMBER)
return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
@@ -2963,7 +3123,6 @@ xfs_btree_split(
#define xfs_btree_split __xfs_btree_split
#endif /* __KERNEL__ */
-
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -2988,7 +3147,7 @@ xfs_btree_new_iroot(
XFS_BTREE_STATS_INC(cur, newroot);
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
level = cur->bc_nlevels - 1;
@@ -2996,7 +3155,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+ error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3014,9 +3173,9 @@ xfs_btree_new_iroot(
* In that case have to also ensure the blkno remains correct
*/
memcpy(cblock, block, xfs_btree_block_len(cur));
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (xfs_has_crc(cur->bc_mp)) {
__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
cblock->bb_u.l.bb_blkno = bno;
else
cblock->bb_u.s.bb_blkno = bno;
@@ -3069,6 +3228,21 @@ error0:
return error;
}
+static void
+xfs_btree_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ if (cur->bc_flags & XFS_BTREE_STAGING) {
+ /* Update the btree root information for a per-AG fake root. */
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
+ cur->bc_ag.afake->af_levels += inc;
+ } else {
+ cur->bc_ops->set_root(cur, ptr, inc);
+ }
+}
+
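
A hedged illustration of the fake-root bookkeeping above, using only the
fields visible in this patch; the sequence is illustrative rather than a
new interface:

	/*
	 * A staging cursor keeps its root in cur->bc_ag.afake, so growing
	 * the tree by one level is recorded as:
	 *
	 *	afake->af_root   = be32_to_cpu(new_root.s);
	 *	afake->af_levels += 1;
	 *
	 * and xfs_btree_init_ptr_from_cur() later reads af_root back when
	 * a lookup starts from the staged tree.
	 */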
/*
* Allocate a new root block, fill it in.
*/
@@ -3093,10 +3267,10 @@ xfs_btree_new_root(
XFS_BTREE_STATS_INC(cur, newroot);
/* initialise our start point from the cursor */
- cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+ xfs_btree_init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+ error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3109,7 +3283,7 @@ xfs_btree_new_root(
goto error0;
/* Set the root in the holding structure increasing the level by 1. */
- cur->bc_ops->set_root(cur, &lptr, 1);
+ xfs_btree_set_root(cur, &lptr, 1);
/*
* At the previous root level there are now two blocks: the old root,
@@ -3213,8 +3387,7 @@ xfs_btree_make_block_unfull(
{
int error = 0;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1) {
+ if (xfs_btree_at_iroot(cur, level)) {
struct xfs_inode *ip = cur->bc_ino.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
@@ -3299,8 +3472,8 @@ xfs_btree_insrec(
* If we have an external root pointer, and we've made it to the
* root level, allocate a new root block and we're done.
*/
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level >= cur->bc_nlevels)) {
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
+ level >= cur->bc_nlevels) {
error = xfs_btree_new_root(cur, stat);
xfs_btree_set_ptr_null(cur, ptrp);
@@ -3524,6 +3697,7 @@ xfs_btree_insert(
}
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3537,7 +3711,8 @@ xfs_btree_insert(
if (pcur != cur &&
(ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
/* Save the state from the cursor before we trash it */
- if (cur->bc_ops->update_cursor)
+ if (cur->bc_ops->update_cursor &&
+ !(cur->bc_flags & XFS_BTREE_STAGING))
cur->bc_ops->update_cursor(pcur, cur);
cur->bc_nlevels = pcur->bc_nlevels;
xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
@@ -3586,7 +3761,7 @@ xfs_btree_kill_iroot(
#endif
int i;
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_nlevels > 1);
/*
@@ -3680,7 +3855,7 @@ xfs_btree_kill_root(
* Update the root pointer, decreasing the level by 1 and then
* free the old root.
*/
- cur->bc_ops->set_root(cur, newroot, -1);
+ xfs_btree_set_root(cur, newroot, -1);
error = xfs_btree_free_block(cur, bp);
if (error)
@@ -3822,27 +3997,25 @@ xfs_btree_delrec(
* Try to get rid of the next level down. If we can't then there's
* nothing left to do.
*/
- if (level == cur->bc_nlevels - 1) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
- xfs_iroot_realloc(cur->bc_ino.ip, -1,
- cur->bc_ino.whichfork);
+ if (xfs_btree_at_iroot(cur, level)) {
+ xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
- error = xfs_btree_kill_iroot(cur);
- if (error)
- goto error0;
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
- error = xfs_btree_dec_cursor(cur, level, stat);
- if (error)
- goto error0;
- *stat = 1;
- return 0;
- }
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
- /*
- * If this is the root level, and there's only one entry left,
- * and it's NOT the leaf level, then we can get rid of this
- * level.
- */
+ /*
+ * If this is the root level, and there's only one entry left, and it's
+ * NOT the leaf level, then we can get rid of this level.
+ */
+ if (level == cur->bc_nlevels - 1) {
if (numrecs == 1 && level > 0) {
union xfs_btree_ptr *pp;
/*
@@ -3891,7 +4064,7 @@ xfs_btree_delrec(
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
/*
* One child of root, need to get a chance to copy its contents
* into the root and delete it. Can't go up to next level,
@@ -3931,6 +4104,7 @@ xfs_btree_delrec(
*/
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3939,12 +4113,14 @@ xfs_btree_delrec(
if (error)
goto error0;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3992,6 +4168,7 @@ xfs_btree_delrec(
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4000,6 +4177,7 @@ xfs_btree_delrec(
if (error)
goto error0;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4017,6 +4195,7 @@ xfs_btree_delrec(
*/
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4026,6 +4205,7 @@ xfs_btree_delrec(
goto error0;
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4201,8 +4381,8 @@ xfs_btree_delrec(
* If we joined with the right neighbor and there's a level above
* us, increment the cursor at that level.
*/
- else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
- (level + 1 < cur->bc_nlevels)) {
+ else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
+ level + 1 < cur->bc_nlevels) {
error = xfs_btree_increment(cur, level + 1, &i);
if (error)
goto error0;
@@ -4270,7 +4450,7 @@ xfs_btree_delete(
* If we combined blocks as part of deleting the record, delrec won't
* have updated the parent high keys so we have to do that here.
*/
- if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
error = xfs_btree_updkeys_force(cur, 0);
if (error)
goto error0;
@@ -4344,7 +4524,7 @@ xfs_btree_visit_block(
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
- union xfs_btree_ptr rptr;
+ union xfs_btree_ptr rptr, bufptr;
int error;
/* do right sibling readahead */
@@ -4367,15 +4547,12 @@ xfs_btree_visit_block(
* return the same block without checking if the right sibling points
* back to us and creates a cyclic reference in the btree.
*/
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(bp)))
- return -EFSCORRUPTED;
- } else {
- if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
- xfs_buf_daddr(bp)))
- return -EFSCORRUPTED;
+ xfs_btree_buf_to_ptr(cur, bp, &bufptr);
+ if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
+
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4393,7 +4570,7 @@ xfs_btree_visit_blocks(
struct xfs_btree_block *block = NULL;
int error = 0;
- cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+ xfs_btree_init_ptr_from_cur(cur, &lptr);
/* for each level */
for (level = cur->bc_nlevels - 1; level >= 0; level--) {
@@ -4471,7 +4648,7 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
@@ -4489,7 +4666,7 @@ xfs_btree_block_change_owner(
* though, so everything is consistent in memory.
*/
if (!bp) {
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(level == cur->bc_nlevels - 1);
return 0;
}
@@ -4523,7 +4700,7 @@ xfs_btree_change_owner(
/* Verify the v5 fields of a long-format btree block. */
xfs_failaddr_t
-xfs_btree_lblock_v5hdr_verify(
+xfs_btree_fsblock_v5hdr_verify(
struct xfs_buf *bp,
uint64_t owner)
{
@@ -4544,7 +4721,7 @@ xfs_btree_lblock_v5hdr_verify(
/* Verify a long-format btree block. */
xfs_failaddr_t
-xfs_btree_lblock_verify(
+xfs_btree_fsblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
@@ -4553,28 +4730,60 @@ xfs_btree_lblock_verify(
xfs_fsblock_t fsb;
xfs_failaddr_t fa;
+ ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_rightsib);
return fa;
}
+/* Verify an in-memory btree block. */
+xfs_failaddr_t
+xfs_btree_memblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buftarg *btp = bp->b_target;
+ xfs_failaddr_t fa;
+ xfbno_t bno;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_leftsib);
+ if (fa)
+ return fa;
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_rightsib);
+ if (fa)
+ return fa;
+
+ return NULL;
+}
/**
- * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
* btree block
*
* @bp: buffer containing the btree block
*/
xfs_failaddr_t
-xfs_btree_sblock_v5hdr_verify(
+xfs_btree_agblock_v5hdr_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
@@ -4593,13 +4802,13 @@ xfs_btree_sblock_v5hdr_verify(
}
/**
- * xfs_btree_sblock_verify() -- verify a short-format btree block
+ * xfs_btree_agblock_verify() -- verify a short-format btree block
*
* @bp: buffer containing the btree block
* @max_recs: maximum records allowed in this btree node
*/
xfs_failaddr_t
-xfs_btree_sblock_verify(
+xfs_btree_agblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
@@ -4608,16 +4817,18 @@ xfs_btree_sblock_verify(
xfs_agblock_t agbno;
xfs_failaddr_t fa;
+ ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
@@ -4815,7 +5026,7 @@ xfs_btree_overlapped_query_range(
/* Load the root of the btree. */
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
if (error)
return error;
@@ -4966,7 +5177,7 @@ xfs_btree_query_range(
if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
return -EINVAL;
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return xfs_btree_simple_query_range(cur, &low_key,
&high_key, fn, priv);
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
@@ -5020,7 +5231,7 @@ xfs_btree_diff_two_ptrs(
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}
@@ -5074,7 +5285,7 @@ xfs_btree_has_records_helper(
key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
&rec_key, info->key_mask);
if (key_contig == XBTREE_KEY_OVERLAP &&
- !(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return -EFSCORRUPTED;
if (key_contig == XBTREE_KEY_GAP)
return -ECANCELED;
@@ -5168,7 +5379,7 @@ xfs_btree_has_more_records(
return true;
/* There are more record blocks. */
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
@@ -5233,6 +5444,7 @@ xfs_btree_goto_left_edge(
return error;
if (stat != 0) {
ASSERT(0);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index d906324e25c8..f93374278aa1 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -55,15 +55,8 @@ union xfs_btree_rec {
#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
-#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
-#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
-#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
-#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
-
-uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+struct xfs_btree_ops;
+uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
/*
* For logging record fields.
@@ -86,9 +79,11 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
* Generic stats interface
*/
#define XFS_BTREE_STATS_INC(cur, stat) \
- XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+ XFS_STATS_INC_OFF((cur)->bc_mp, \
+ (cur)->bc_ops->statoff + __XBTS_ ## stat)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
- XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+ XFS_STATS_ADD_OFF((cur)->bc_mp, \
+ (cur)->bc_ops->statoff + __XBTS_ ## stat, val)
enum xbtree_key_contig {
XBTREE_KEY_GAP = 0,
@@ -111,10 +106,37 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
return XBTREE_KEY_OVERLAP;
}
+#define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64))
+#define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32))
+
+enum xfs_btree_type {
+ XFS_BTREE_TYPE_AG,
+ XFS_BTREE_TYPE_INODE,
+ XFS_BTREE_TYPE_MEM,
+};
+
struct xfs_btree_ops {
- /* size of the key and record structures */
- size_t key_len;
- size_t rec_len;
+ const char *name;
+
+ /* Type of btree - AG-rooted, inode-rooted, or in-memory */
+ enum xfs_btree_type type;
+
+ /* XFS_BTGEO_* flags that determine the geometry of the btree */
+ unsigned int geom_flags;
+
+ /* size of the key, pointer, and record structures */
+ size_t key_len;
+ size_t ptr_len;
+ size_t rec_len;
+
+ /* LRU refcount to set on each btree buffer created */
+ unsigned int lru_refs;
+
+ /* offset of btree stats array */
+ unsigned int statoff;
+
+ /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
+ unsigned int sick_mask;
/* cursor operations */
struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
@@ -199,6 +221,10 @@ struct xfs_btree_ops {
const union xfs_btree_key *mask);
};
+/* btree geometry flags */
+#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */
+#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */
+
/*
* Reasons for the update_lastrec method to be called.
*/
@@ -215,39 +241,6 @@ union xfs_btree_irec {
struct xfs_refcount_irec rc;
};
-/* Per-AG btree information. */
-struct xfs_btree_cur_ag {
- struct xfs_perag *pag;
- union {
- struct xfs_buf *agbp;
- struct xbtree_afakeroot *afake; /* for staging cursor */
- };
- union {
- struct {
- unsigned int nr_ops; /* # record updates */
- unsigned int shape_changes; /* # of extent splits */
- } refc;
- struct {
- bool active; /* allocation cursor state */
- } abt;
- };
-};
-
-/* Btree-in-inode cursor information */
-struct xfs_btree_cur_ino {
- struct xfs_inode *ip;
- struct xbtree_ifakeroot *ifake; /* for staging cursor */
- int allocated;
- short forksize;
- char whichfork;
- char flags;
-/* We are converting a delalloc reservation */
-#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
-
-/* For extent swap, ignore owner check in verifier */
-#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
-};
-
struct xfs_btree_level {
/* buffer pointer */
struct xfs_buf *bp;
@@ -272,21 +265,38 @@ struct xfs_btree_cur
const struct xfs_btree_ops *bc_ops;
struct kmem_cache *bc_cache; /* cursor cache */
unsigned int bc_flags; /* btree features - below */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
uint8_t bc_nlevels; /* number of levels in the tree */
uint8_t bc_maxlevels; /* maximum levels for this btree type */
- int bc_statoff; /* offset of btree stats array */
- /*
- * Short btree pointers need an agno to be able to turn the pointers
- * into physical addresses for IO, so the btree cursor switches between
- * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
- * cursor.
- */
+ /* per-type information */
union {
- struct xfs_btree_cur_ag bc_ag;
- struct xfs_btree_cur_ino bc_ino;
+ struct {
+ struct xfs_inode *ip;
+ short forksize;
+ char whichfork;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ } bc_ino;
+ struct {
+ struct xfs_perag *pag;
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ } bc_ag;
+ struct {
+ struct xfbtree *xfbtree;
+ struct xfs_perag *pag;
+ } bc_mem;
+ };
+
+ /* per-format private data */
+ union {
+ struct {
+ int allocated;
+ } bc_bmap; /* bmapbt */
+ struct {
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
+ } bc_refc; /* refcountbt */
};
/* Must be at the end of the struct! */
@@ -304,18 +314,22 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels);
}
-/* cursor flags */
-#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
-#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
-#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
-#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
-#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/* cursor state flags */
/*
* The root of this btree is a fakeroot structure so that we can stage a btree
* rebuild without leaving it accessible via primary metadata.  Staging
* cursors now keep the regular ops struct for their whole lifetime, so
* there is no longer a private ops copy to free when the cursor is deleted.
*/
-#define XFS_BTREE_STAGING (1<<5)
+#define XFS_BTREE_STAGING (1U << 0)
+
+/* We are converting a delalloc reservation (only for bmbt btrees) */
+#define XFS_BTREE_BMBT_WASDEL (1U << 1)
+
+/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */
+#define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2)
+
+/* Cursor is active (only for allocbt btrees) */
+#define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3)
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -325,14 +339,10 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
*/
#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
-/*
- * Internal long and short btree block checks. They return NULL if the
- * block is ok or the address of the failed check otherwise.
- */
-xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
- struct xfs_btree_block *block, int level, struct xfs_buf *bp);
-xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+int __xfs_btree_check_ptr(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int index, int level);
/*
* Check that block header is ok.
@@ -345,24 +355,6 @@ xfs_btree_check_block(
struct xfs_buf *bp); /* buffer containing block, if any */
/*
- * Check that (long) pointer is ok.
- */
-bool /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t fsbno, /* btree block disk address */
- int level); /* btree block level */
-
-/*
- * Check that (short) pointer is ok.
- */
-bool /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t agbno, /* btree block disk address */
- int level); /* btree block level */
-
-/*
* Delete the btree cursor.
*/
void
@@ -392,63 +384,14 @@ xfs_btree_offsets(
int *last); /* output: last byte offset */
/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int /* error */
-xfs_btree_read_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- struct xfs_buf **bpp, /* buffer for fsbno */
- int refval, /* ref count value for buffer */
- const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-void /* error */
-xfs_btree_reada_bufl(
- struct xfs_mount *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-void /* error */
-xfs_btree_reada_bufs(
- struct xfs_mount *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops);
-
-/*
* Initialise a new btree block header
*/
-void
-xfs_btree_init_block(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner);
-
-void
-xfs_btree_init_block_int(
- struct xfs_mount *mp,
- struct xfs_btree_block *buf,
- xfs_daddr_t blkno,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner,
- unsigned int flags);
+void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp,
+ const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs,
+ __u64 owner);
+void xfs_btree_init_block(struct xfs_mount *mp,
+ struct xfs_btree_block *buf, const struct xfs_btree_ops *ops,
+ __u16 level, __u16 numrecs, __u64 owner);
/*
* Common btree core entry points.
@@ -467,10 +410,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
/*
* btree block CRC helpers
*/
-void xfs_btree_lblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
-void xfs_btree_sblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+void xfs_btree_fsblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_fsblock_verify_crc(struct xfs_buf *);
+void xfs_btree_agblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_agblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
@@ -510,12 +453,14 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
-xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
-xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp);
+xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp,
uint64_t owner);
-xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
@@ -690,7 +635,7 @@ xfs_btree_islastblock(
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
@@ -714,21 +659,28 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
const union xfs_btree_key *src_key, int numkeys);
+void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
static inline struct xfs_btree_cur *
xfs_btree_alloc_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_btnum_t btnum,
+ const struct xfs_btree_ops *ops,
uint8_t maxlevels,
struct kmem_cache *cache)
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN ||
+ ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN);
+
+ /* BMBT allocations can come through from non-transactional context. */
+ cur = kmem_cache_zalloc(cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ cur->bc_ops = ops;
cur->bc_tp = tp;
cur->bc_mp = mp;
- cur->bc_btnum = btnum;
cur->bc_maxlevels = maxlevels;
cur->bc_cache = cache;
@@ -740,4 +692,14 @@ void xfs_btree_destroy_cur_caches(void);
int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
+/* Does this level of the cursor point to the inode root (and not a block)? */
+static inline bool
+xfs_btree_at_iroot(
+ const struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
+ level == cur->bc_nlevels - 1;
+}
+
#endif /* __XFS_BTREE_H__ */
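
xfs_btree_at_iroot() folds the old open-coded ROOT_IN_INODE-plus-level test into one helper keyed off the new type enum. A self-contained sketch of the same logic, with abbreviated stand-in types:

#include <stdbool.h>
#include <stdio.h>

enum btree_type { BTREE_TYPE_AG, BTREE_TYPE_INODE, BTREE_TYPE_MEM };

struct ops { enum btree_type type; };
struct cur {
	const struct ops *ops;
	unsigned char nlevels;
};

/*
 * Mirrors xfs_btree_at_iroot(): only inode-rooted btrees keep their
 * highest level in the inode fork rather than in a disk block.
 */
static bool at_iroot(const struct cur *c, int level)
{
	return c->ops->type == BTREE_TYPE_INODE && level == c->nlevels - 1;
}

int main(void)
{
	static const struct ops bmbt_like_ops = { .type = BTREE_TYPE_INODE };
	struct cur c = { .ops = &bmbt_like_ops, .nlevels = 3 };

	printf("%d %d\n", at_iroot(&c, 2), at_iroot(&c, 1));	/* 1 0 */
	return 0;
}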
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
new file mode 100644
index 000000000000..036061fe32cc
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_ag.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+
+/* Set the root of an in-memory btree. */
+void
+xfbtree_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ cur->bc_mem.xfbtree->root = *ptr;
+ cur->bc_mem.xfbtree->nlevels += inc;
+}
+
+/* Initialize a pointer from the in-memory btree header. */
+void
+xfbtree_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ *ptr = cur->bc_mem.xfbtree->root;
+}
+
+/* Duplicate an in-memory btree cursor. */
+struct xfs_btree_cur *
+xfbtree_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *ncur;
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops,
+ cur->bc_maxlevels, cur->bc_cache);
+ ncur->bc_flags = cur->bc_flags;
+ ncur->bc_nlevels = cur->bc_nlevels;
+ ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
+
+ if (cur->bc_mem.pag)
+ ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+
+ return ncur;
+}
+
+/* Close the btree xfile and release all resources. */
+void
+xfbtree_destroy(
+ struct xfbtree *xfbt)
+{
+ xfs_buftarg_drain(xfbt->target);
+}
+
+/* Compute the number of bytes available for records. */
+static inline unsigned int
+xfbtree_rec_bytes(
+ struct xfs_mount *mp,
+ const struct xfs_btree_ops *ops)
+{
+ return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+}
+
+/* Initialize an empty leaf block as the btree root. */
+STATIC int
+xfbtree_init_leaf_block(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ const struct xfs_btree_ops *ops)
+{
+ struct xfs_buf *bp;
+ xfbno_t bno = xfbt->highest_bno++;
+ int error;
+
+ error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE,
+ &bp);
+ if (error)
+ return error;
+
+ trace_xfbtree_create_root_buf(xfbt, bp);
+
+ bp->b_ops = ops->buf_ops;
+ xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner);
+ xfs_buf_relse(bp);
+
+ xfbt->root.l = cpu_to_be64(bno);
+ return 0;
+}
+
+/*
+ * Create an in-memory btree root that can be used with the given xmbuf.
+ * Callers must set xfbt->owner.
+ */
+int
+xfbtree_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ const struct xfs_btree_ops *ops)
+{
+ unsigned int blocklen = xfbtree_rec_bytes(mp, ops);
+ unsigned int keyptr_len;
+ int error;
+
+ /* Requires a long-format btree with CRC-enabled blocks */
+ if (!xfs_has_crc(mp)) {
+ ASSERT(xfs_has_crc(mp));
+ return -EINVAL;
+ }
+ if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) {
+ ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN);
+ return -EINVAL;
+ }
+
+ memset(xfbt, 0, sizeof(*xfbt));
+ xfbt->target = btp;
+
+ /* Set up min/maxrecs for this btree. */
+ keyptr_len = ops->key_len + sizeof(__be64);
+ xfbt->maxrecs[0] = blocklen / ops->rec_len;
+ xfbt->maxrecs[1] = blocklen / keyptr_len;
+ xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
+ xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
+ xfbt->highest_bno = 0;
+ xfbt->nlevels = 1;
+
+ /* Initialize the empty btree. */
+ error = xfbtree_init_leaf_block(mp, xfbt, ops);
+ if (error)
+ goto err_freesp;
+
+ trace_xfbtree_init(mp, xfbt, ops);
+
+ return 0;
+
+err_freesp:
+ xfs_buftarg_drain(xfbt->target);
+ return error;
+}
+
+/* Allocate a block to our in-memory btree. */
+int
+xfbtree_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+ xfbno_t bno = xfbt->highest_bno++;
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ trace_xfbtree_alloc_block(xfbt, cur, bno);
+
+ /* Fail if the block address exceeds the maximum for the buftarg. */
+ if (!xfbtree_verify_bno(xfbt, bno)) {
+ ASSERT(xfbtree_verify_bno(xfbt, bno));
+ *stat = 0;
+ return 0;
+ }
+
+ new->l = cpu_to_be64(bno);
+ *stat = 1;
+ return 0;
+}
+
+/* Free a block from our in-memory btree. */
+int
+xfbtree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+ xfs_daddr_t daddr = xfs_buf_daddr(bp);
+ xfbno_t bno = xfs_daddr_to_xfbno(daddr);
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ trace_xfbtree_free_block(xfbt, cur, bno);
+
+ if (bno + 1 == xfbt->highest_bno)
+ xfbt->highest_bno--;
+
+ return 0;
+}
+
+/* Return the minimum number of records for a btree block. */
+int
+xfbtree_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+
+ return xfbt->minrecs[level != 0];
+}
+
+/* Return the maximum number of records for a btree block. */
+int
+xfbtree_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+
+ return xfbt->maxrecs[level != 0];
+}
+
+/* If this log item is a buffer item that came from the xfbtree, return it. */
+static inline struct xfs_buf *
+xfbtree_buf_match(
+ struct xfbtree *xfbt,
+ const struct xfs_log_item *lip)
+{
+ const struct xfs_buf_log_item *bli;
+ struct xfs_buf *bp;
+
+ if (lip->li_type != XFS_LI_BUF)
+ return NULL;
+
+ bli = container_of(lip, struct xfs_buf_log_item, bli_item);
+ bp = bli->bli_buf;
+ if (bp->b_target != xfbt->target)
+ return NULL;
+
+ return bp;
+}
+
+/*
+ * Commit changes to the incore btree immediately by writing all dirty xfbtree
+ * buffers to the backing xfile. This detaches all xfbtree buffers from the
+ * transaction, even on failure. The buffer locks are dropped between the
+ * delwri queue and submit, so the caller must synchronize btree access.
+ *
+ * Normally we'd let the buffers commit with the transaction and get written to
+ * the xfile via the log, but online repair stages ephemeral btrees in memory
+ * and uses the btree_staging functions to write new btrees to disk atomically.
+ * The in-memory btree (and its backing store) are discarded at the end of the
+ * repair phase, which means that xfbtree buffers cannot commit with the rest
+ * of a transaction.
+ *
+ * In other words, online repair only needs the transaction to collect buffer
+ * pointers and to avoid buffer deadlocks, not to guarantee consistency of
+ * updates.
+ */
+int
+xfbtree_trans_commit(
+ struct xfbtree *xfbt,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *n;
+ bool tp_dirty = false;
+ int error = 0;
+
+ /*
+ * For each xfbtree buffer attached to the transaction, write the dirty
+ * buffers to the xfile and release them.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
+
+ if (!bp) {
+ if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ tp_dirty |= true;
+ continue;
+ }
+
+ trace_xfbtree_trans_commit_buf(xfbt, bp);
+
+ xmbuf_trans_bdetach(tp, bp);
+
+ /*
+ * If the buffer fails verification, note the failure but
+ * continue walking the transaction items so that we remove all
+ * ephemeral btree buffers.
+ */
+ if (!error)
+ error = xmbuf_finalize(bp);
+
+ xfs_buf_relse(bp);
+ }
+
+ /*
+ * Reset the transaction's dirty flag to reflect the dirty state of the
+ * log items that are still attached.
+ */
+ tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+ (tp_dirty ? XFS_TRANS_DIRTY : 0);
+
+ return error;
+}
+
+/*
+ * Cancel changes to the incore btree by detaching all the xfbtree buffers.
+ * Changes are not undone, so callers must not access the btree ever again.
+ */
+void
+xfbtree_trans_cancel(
+ struct xfbtree *xfbt,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *n;
+ bool tp_dirty = false;
+
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
+
+ if (!bp) {
+ if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ tp_dirty |= true;
+ continue;
+ }
+
+ trace_xfbtree_trans_cancel_buf(xfbt, bp);
+
+ xmbuf_trans_bdetach(tp, bp);
+ xfs_buf_relse(bp);
+ }
+
+ /*
+ * Reset the transaction's dirty flag to reflect the dirty state of the
+ * log items that are still attached.
+ */
+ tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+ (tp_dirty ? XFS_TRANS_DIRTY : 0);
+}
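
xfbtree_init() sizes min/max records from whatever space remains in an xmbuf block after the long-format CRC header. A runnable sketch of that arithmetic, assuming a 4096-byte block and a 72-byte header; the record and key sizes below are made up:

#include <stdio.h>

#define BLOCKSIZE	4096u	/* assumed xmbuf block size */
#define LBLOCK_CRC_LEN	72u	/* assumed long-format CRC header size */

int main(void)
{
	unsigned int blocklen = BLOCKSIZE - LBLOCK_CRC_LEN;
	unsigned int rec_len = 24;			/* assumed */
	unsigned int keyptr_len = 8 + sizeof(unsigned long long);

	unsigned int maxrecs[2] = {
		blocklen / rec_len,		/* leaf blocks */
		blocklen / keyptr_len,		/* node blocks */
	};

	printf("leaf: max %u min %u\n", maxrecs[0], maxrecs[0] / 2);
	printf("node: max %u min %u\n", maxrecs[1], maxrecs[1] / 2);
	return 0;
}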
diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h
new file mode 100644
index 000000000000..1c3825786ec8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BTREE_MEM_H__
+#define __XFS_BTREE_MEM_H__
+
+typedef uint64_t xfbno_t;
+
+#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE)
+#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT)
+#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT)
+
+static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno)
+{
+ return blkno << XFBNO_BBSHIFT;
+}
+
+static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr)
+{
+ return daddr >> XFBNO_BBSHIFT;
+}
+
+struct xfbtree {
+ /* buffer cache target for this in-memory btree */
+ struct xfs_buftarg *target;
+
+ /* Highest block number that has been written to. */
+ xfbno_t highest_bno;
+
+ /* Owner of this btree. */
+ unsigned long long owner;
+
+ /* Btree header */
+ union xfs_btree_ptr root;
+ unsigned int nlevels;
+
+ /* Minimum and maximum records per block. */
+ unsigned int maxrecs[2];
+ unsigned int minrecs[2];
+};
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno)
+{
+ return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno));
+}
+
+void xfbtree_set_root(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int inc);
+void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur);
+
+int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level);
+int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level);
+
+int xfbtree_alloc_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr,
+ int *stat);
+int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+/* Callers must set xfbt->owner before calling this */
+int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt,
+ struct xfs_buftarg *btp, const struct xfs_btree_ops *ops);
+void xfbtree_destroy(struct xfbtree *xfbt);
+
+int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp);
+void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp);
+#else
+# define xfbtree_verify_bno(...) (false)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_BTREE_MEM_H__ */
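
The xfbno_t/daddr helpers are pure shifts. A sketch assuming 512-byte sectors (BBSHIFT == 9) and page-sized 4096-byte xmbuf blocks (shift of 12), which works out to eight sectors per block:

#include <stdint.h>
#include <stdio.h>

#define BBSHIFT			9
#define XMBUF_BLOCKSHIFT	12	/* assumed page-sized blocks */
#define XFBNO_BBSHIFT		(XMBUF_BLOCKSHIFT - BBSHIFT)

static uint64_t xfbno_to_daddr(uint64_t bno)   { return bno << XFBNO_BBSHIFT; }
static uint64_t daddr_to_xfbno(uint64_t daddr) { return daddr >> XFBNO_BBSHIFT; }

int main(void)
{
	uint64_t bno = 5;
	uint64_t daddr = xfbno_to_daddr(bno);

	/* round trip: 5 -> 40 -> 5 */
	printf("bno %llu -> daddr %llu -> bno %llu\n",
	       (unsigned long long)bno, (unsigned long long)daddr,
	       (unsigned long long)daddr_to_xfbno(daddr));
	return 0;
}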
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index e276eba87cb1..694929703152 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -39,63 +39,6 @@
*/
/*
- * Don't allow staging cursors to be duplicated because they're supposed to be
- * kept private to a single thread.
- */
-STATIC struct xfs_btree_cur *
-xfs_btree_fakeroot_dup_cursor(
- struct xfs_btree_cur *cur)
-{
- ASSERT(0);
- return NULL;
-}
-
-/*
- * Don't allow block allocation for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- *
- * Bulk loading uses a separate callback to obtain new blocks from a
- * preallocated list, which prevents ENOSPC failures during loading.
- */
-STATIC int
-xfs_btree_fakeroot_alloc_block(
- struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *start_bno,
- union xfs_btree_ptr *new_bno,
- int *stat)
-{
- ASSERT(0);
- return -EFSCORRUPTED;
-}
-
-/*
- * Don't allow block freeing for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- */
-STATIC int
-xfs_btree_fakeroot_free_block(
- struct xfs_btree_cur *cur,
- struct xfs_buf *bp)
-{
- ASSERT(0);
- return -EFSCORRUPTED;
-}
-
-/* Initialize a pointer to the root block from the fakeroot. */
-STATIC void
-xfs_btree_fakeroot_init_ptr_from_cur(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- struct xbtree_afakeroot *afake;
-
- ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
-
- afake = cur->bc_ag.afake;
- ptr->s = cpu_to_be32(afake->af_root);
-}
-
-/*
* Bulk Loading for AG Btrees
* ==========================
*
@@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur(
* cursor into a regular btree cursor.
*/
-/* Update the btree root information for a per-AG fake root. */
-STATIC void
-xfs_btree_afakeroot_set_root(
- struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *ptr,
- int inc)
-{
- struct xbtree_afakeroot *afake = cur->bc_ag.afake;
-
- ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- afake->af_root = be32_to_cpu(ptr->s);
- afake->af_levels += inc;
-}
-
/*
* Initialize an AG-rooted btree cursor with the given AG btree fake root.
- * The btree cursor's bc_ops will be overridden as needed to make the staging
- * functionality work.
*/
void
xfs_btree_stage_afakeroot(
struct xfs_btree_cur *cur,
struct xbtree_afakeroot *afake)
{
- struct xfs_btree_ops *nops;
-
ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
- ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_tp == NULL);
- nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
- memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
- nops->alloc_block = xfs_btree_fakeroot_alloc_block;
- nops->free_block = xfs_btree_fakeroot_free_block;
- nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
- nops->set_root = xfs_btree_afakeroot_set_root;
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
cur->bc_ag.afake = afake;
cur->bc_nlevels = afake->af_levels;
- cur->bc_ops = nops;
cur->bc_flags |= XFS_BTREE_STAGING;
}
@@ -163,17 +79,15 @@ void
xfs_btree_commit_afakeroot(
struct xfs_btree_cur *cur,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- const struct xfs_btree_ops *ops)
+ struct xfs_buf *agbp)
{
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
ASSERT(cur->bc_tp == NULL);
trace_xfs_btree_commit_afakeroot(cur);
- kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.afake = NULL;
cur->bc_ag.agbp = agbp;
- cur->bc_ops = ops;
cur->bc_flags &= ~XFS_BTREE_STAGING;
cur->bc_tp = tp;
}
@@ -211,29 +125,16 @@ xfs_btree_commit_afakeroot(
void
xfs_btree_stage_ifakeroot(
struct xfs_btree_cur *cur,
- struct xbtree_ifakeroot *ifake,
- struct xfs_btree_ops **new_ops)
+ struct xbtree_ifakeroot *ifake)
{
- struct xfs_btree_ops *nops;
-
ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_tp == NULL);
- nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
- memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
- nops->alloc_block = xfs_btree_fakeroot_alloc_block;
- nops->free_block = xfs_btree_fakeroot_free_block;
- nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
cur->bc_ino.ifake = ifake;
cur->bc_nlevels = ifake->if_levels;
- cur->bc_ops = nops;
+ cur->bc_ino.forksize = ifake->if_fork_size;
cur->bc_flags |= XFS_BTREE_STAGING;
-
- if (new_ops)
- *new_ops = nops;
}
/*
@@ -246,18 +147,15 @@ void
xfs_btree_commit_ifakeroot(
struct xfs_btree_cur *cur,
struct xfs_trans *tp,
- int whichfork,
- const struct xfs_btree_ops *ops)
+ int whichfork)
{
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
ASSERT(cur->bc_tp == NULL);
trace_xfs_btree_commit_ifakeroot(cur);
- kmem_free((void *)cur->bc_ops);
cur->bc_ino.ifake = NULL;
cur->bc_ino.whichfork = whichfork;
- cur->bc_ops = ops;
cur->bc_flags &= ~XFS_BTREE_STAGING;
cur->bc_tp = tp;
}
@@ -397,8 +295,7 @@ xfs_btree_bload_prep_block(
struct xfs_btree_block *new_block;
int ret;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1) {
+ if (xfs_btree_at_iroot(cur, level)) {
struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
size_t new_size;
@@ -406,14 +303,12 @@ xfs_btree_bload_prep_block(
/* Allocate a new incore btree root block. */
new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
- ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
/* Initialize it and send it out. */
- xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
- XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
- nr_this_block, cur->bc_ino.ip->i_ino,
- cur->bc_flags);
+ xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
+ level, nr_this_block, cur->bc_ino.ip->i_ino);
*bpp = NULL;
*blockp = ifp->if_broot;
@@ -704,7 +599,7 @@ xfs_btree_bload_compute_geometry(
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &level_blocks, &dontcare64);
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
/*
* If all the items we want to store at this level
* would fit in the inode root block, then we have our
@@ -763,7 +658,7 @@ xfs_btree_bload_compute_geometry(
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
bbl->nr_blocks = nr_blocks - 1;
else
bbl->nr_blocks = nr_blocks;
@@ -890,7 +785,7 @@ xfs_btree_bload(
}
/* Initialize the new root. */
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
cur->bc_ino.ifake->if_blocks = total_blocks - 1;
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index 055ea43b1e18..0c9c2ffb127a 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -22,7 +22,7 @@ struct xbtree_afakeroot {
void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
struct xbtree_afakeroot *afake);
void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
- struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+ struct xfs_buf *agbp);
/* Fake root for an inode-rooted btree. */
struct xbtree_ifakeroot {
@@ -41,10 +41,9 @@ struct xbtree_ifakeroot {
/* Cursor interactions with fake roots for inode-rooted btrees. */
void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
- struct xbtree_ifakeroot *ifake,
- struct xfs_btree_ops **new_ops);
+ struct xbtree_ifakeroot *ifake);
void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
- int whichfork, const struct xfs_btree_ops *ops);
+ int whichfork);
/* Bulk loading of staged btrees. */
typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
@@ -76,8 +75,7 @@ struct xfs_btree_bload {
/*
* This function should return the size of the in-core btree root
- * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
- * types.
+ * block. It is only necessary for XFS_BTREE_TYPE_INODE btrees.
*/
xfs_btree_bload_iroot_size_fn iroot_size;
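
With the fake-ops table gone, staging reduces to setting and clearing one cursor flag around the fake root. A toy model of that lifecycle (field types are placeholders for the kernel structures):

#include <assert.h>
#include <stdio.h>

#define BTREE_STAGING	(1u << 0)

struct cur { unsigned int flags; void *afake, *agbp; };

/*
 * Shape of xfs_btree_stage_afakeroot()/xfs_btree_commit_afakeroot():
 * no ops swap, just the flag and the fake root pointer.
 */
static void stage(struct cur *c, void *afake)
{
	assert(!(c->flags & BTREE_STAGING));
	c->afake = afake;
	c->flags |= BTREE_STAGING;
}

static void commit(struct cur *c, void *agbp)
{
	assert(c->flags & BTREE_STAGING);
	c->afake = NULL;
	c->agbp = agbp;
	c->flags &= ~BTREE_STAGING;
}

int main(void)
{
	struct cur c = { 0 };
	int fakeroot, agbuf;

	stage(&c, &fakeroot);
	commit(&c, &agbuf);
	printf("staging clear: %d\n", !(c.flags & BTREE_STAGING));	/* 1 */
	return 0;
}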
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 5457188bb4de..718d071bb21a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,6 +23,7 @@
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
/*
* xfs_da_btree.c
@@ -85,7 +86,8 @@ xfs_da_state_alloc(
{
struct xfs_da_state *state;
- state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+ state = kmem_cache_zalloc(xfs_da_state_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
state->args = args;
state->mp = args->dp->i_mount;
return state;
@@ -352,6 +354,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = {
static int
xfs_da3_node_set_type(
struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ int whichfork,
struct xfs_buf *bp)
{
struct xfs_da_blkinfo *info = bp->b_addr;
@@ -373,6 +377,7 @@ xfs_da3_node_set_type(
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
info, sizeof(*info));
xfs_trans_brelse(tp, bp);
+ xfs_dirattr_mark_sick(dp, whichfork);
return -EFSCORRUPTED;
}
}
@@ -391,7 +396,7 @@ xfs_da3_node_read(
&xfs_da3_node_buf_ops);
if (error || !*bpp || !tp)
return error;
- return xfs_da3_node_set_type(tp, *bpp);
+ return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
}
int
@@ -408,6 +413,8 @@ xfs_da3_node_read_mapped(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
bpp, &xfs_da3_node_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, whichfork);
if (error || !*bpp)
return error;
@@ -418,7 +425,7 @@ xfs_da3_node_read_mapped(
if (!tp)
return 0;
- return xfs_da3_node_set_type(tp, *bpp);
+ return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
}
/*
@@ -631,6 +638,7 @@ xfs_da3_split(
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
xfs_buf_mark_corrupt(oldblk->bp);
+ xfs_da_mark_sick(state->args);
error = -EFSCORRUPTED;
goto out;
}
@@ -644,6 +652,7 @@ xfs_da3_split(
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
xfs_buf_mark_corrupt(oldblk->bp);
+ xfs_da_mark_sick(state->args);
error = -EFSCORRUPTED;
goto out;
}
@@ -1635,6 +1644,7 @@ xfs_da3_node_lookup_int(
if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1650,6 +1660,7 @@ xfs_da3_node_lookup_int(
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1658,6 +1669,7 @@ xfs_da3_node_lookup_int(
expected_level = nodehdr.level - 1;
else if (expected_level != nodehdr.level) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
} else
expected_level--;
@@ -1709,12 +1721,16 @@ xfs_da3_node_lookup_int(
}
/* We can't point back to the root. */
- if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
}
- if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
/*
* A leaf block that ends in the hashval that we are interested in
@@ -1732,6 +1748,7 @@ xfs_da3_node_lookup_int(
args->blkno = blk->blkno;
} else {
ASSERT(0);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
@@ -2182,7 +2199,8 @@ xfs_da_grow_inode_int(
* If we didn't get it and the block might work if fragmented,
* try without the CONTIG flag. Loop until we get it all.
*/
- mapp = kmem_alloc(sizeof(*mapp) * count, 0);
+ mapp = kmalloc(sizeof(*mapp) * count,
+ GFP_KERNEL | __GFP_NOFAIL);
for (b = *bno, mapi = 0; b < *bno + count; ) {
c = (int)(*bno + count - b);
nmap = min(XFS_BMAP_MAX_NMAP, c);
@@ -2219,7 +2237,7 @@ xfs_da_grow_inode_int(
out_free_map:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2297,8 +2315,10 @@ xfs_da3_swap_lastblock(
error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, lastoff == 0))
+ if (XFS_IS_CORRUPT(mp, lastoff == 0)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
/*
* Read the last block in the btree space.
*/
@@ -2348,6 +2368,7 @@ xfs_da3_swap_lastblock(
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->forw) != last_blkno ||
sib_info->magic != dead_info->magic)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2368,6 +2389,7 @@ xfs_da3_swap_lastblock(
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->back) != last_blkno ||
sib_info->magic != dead_info->magic)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2390,6 +2412,7 @@ xfs_da3_swap_lastblock(
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp,
level >= 0 && level != par_hdr.level + 1)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2401,6 +2424,7 @@ xfs_da3_swap_lastblock(
entno++)
continue;
if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2426,6 +2450,7 @@ xfs_da3_swap_lastblock(
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2435,6 +2460,7 @@ xfs_da3_swap_lastblock(
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2518,7 +2544,8 @@ xfs_dabuf_map(
int error = 0, nirecs, i;
if (nfsb > 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+ irecs = kzalloc(sizeof(irec) * nfsb,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
nirecs = nfsb;
error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
@@ -2531,7 +2558,8 @@ xfs_dabuf_map(
* larger one that needs to be free by the caller.
*/
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+ map = kzalloc(nirecs * sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!map) {
error = -ENOMEM;
goto out_free_irecs;
@@ -2557,12 +2585,13 @@ xfs_dabuf_map(
*nmaps = nirecs;
out_free_irecs:
if (irecs != &irec)
- kmem_free(irecs);
+ kfree(irecs);
return error;
invalid_mapping:
/* Caller ok with no mapping. */
if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ xfs_dirattr_mark_sick(dp, whichfork);
error = -EFSCORRUPTED;
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
xfs_alert(mp, "%s: bno %u inode %llu",
@@ -2613,7 +2642,7 @@ xfs_da_get_buf(
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2644,6 +2673,8 @@ xfs_da_read_buf(
error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
&bp, ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, whichfork);
if (error)
goto out_free;
@@ -2654,7 +2685,7 @@ xfs_da_read_buf(
*bpp = bp;
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2685,7 +2716,7 @@ xfs_da_reada_buf(
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
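
Most of the xfs_da_btree.c hunks follow one pattern: each -EFSCORRUPTED return gains a matching health update before bailing out. A compact model of that pattern (the errno value and helpers are stand-ins, not the kernel's):

#include <stdio.h>

#define EFSCORRUPTED	117		/* illustrative errno value */

struct args { unsigned int sick; };

static void mark_sick(struct args *a)	/* stands in for xfs_da_mark_sick() */
{
	a->sick |= 1;
}

/*
 * Every corruption bailout now records ill health first, so later
 * reporting can see that this group of metadata needs attention.
 */
static int lookup(struct args *a, int magic_ok)
{
	if (!magic_ok) {
		mark_sick(a);
		return -EFSCORRUPTED;
	}
	return 0;
}

int main(void)
{
	struct args a = { 0 };
	int ret = lookup(&a, 0);

	printf("ret %d sick %u\n", ret, a.sick);	/* ret -117 sick 1 */
	return 0;
}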
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 24f9d1461f9a..060e5c96b70f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -159,6 +159,17 @@ struct xfs_da3_intnode {
#define XFS_DIR3_FT_MAX 9
+#define XFS_DIR3_FTYPE_STR \
+ { XFS_DIR3_FT_UNKNOWN, "unknown" }, \
+ { XFS_DIR3_FT_REG_FILE, "file" }, \
+ { XFS_DIR3_FT_DIR, "directory" }, \
+ { XFS_DIR3_FT_CHRDEV, "char" }, \
+ { XFS_DIR3_FT_BLKDEV, "block" }, \
+ { XFS_DIR3_FT_FIFO, "fifo" }, \
+ { XFS_DIR3_FT_SOCK, "sock" }, \
+ { XFS_DIR3_FT_SYMLINK, "symlink" }, \
+ { XFS_DIR3_FT_WHT, "whiteout" }
+
/*
* Byte offset in data block and shortform entry.
*/
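
XFS_DIR3_FTYPE_STR is laid out as brace-pair initializers, the shape consumed by tracepoint symbol tables. A userspace sketch showing how such a pair-list macro can populate a lookup table (values abbreviated):

#include <stdio.h>

enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR };	/* abbreviated ftype values */

/* Same pair-list shape as XFS_DIR3_FTYPE_STR. */
#define FTYPE_STR \
	{ FT_UNKNOWN,  "unknown" }, \
	{ FT_REG_FILE, "file" }, \
	{ FT_DIR,      "directory" }

static const struct { int val; const char *name; } ftypes[] = { FTYPE_STR };

static const char *ftype_name(int val)
{
	for (unsigned int i = 0; i < sizeof(ftypes) / sizeof(ftypes[0]); i++)
		if (ftypes[i].val == val)
			return ftypes[i].name;
	return "?";
}

int main(void)
{
	printf("%s\n", ftype_name(FT_DIR));	/* directory */
	return 0;
}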
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 66a17910d021..c13276095cc0 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -819,16 +819,16 @@ xfs_defer_can_append(
/* Create a new pending item at the end of the transaction list. */
static inline struct xfs_defer_pending *
xfs_defer_alloc(
- struct xfs_trans *tp,
+ struct list_head *dfops,
const struct xfs_defer_op_type *ops)
{
struct xfs_defer_pending *dfp;
dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
dfp->dfp_ops = ops;
INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+ list_add_tail(&dfp->dfp_list, dfops);
return dfp;
}
@@ -846,7 +846,7 @@ xfs_defer_add(
dfp = xfs_defer_find_last(tp, ops);
if (!dfp || !xfs_defer_can_append(dfp, ops))
- dfp = xfs_defer_alloc(tp, ops);
+ dfp = xfs_defer_alloc(&tp->t_dfops, ops);
xfs_defer_add_item(dfp, li);
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
@@ -870,7 +870,7 @@ xfs_defer_add_barrier(
if (dfp)
return;
- xfs_defer_alloc(tp, &xfs_barrier_defer_type);
+ xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type);
trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
}
@@ -885,14 +885,9 @@ xfs_defer_start_recovery(
struct list_head *r_dfops,
const struct xfs_defer_op_type *ops)
{
- struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops);
- dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
- dfp->dfp_ops = ops;
dfp->dfp_intent = lip;
- INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, r_dfops);
}
/*
@@ -979,7 +974,7 @@ xfs_defer_ops_capture(
return ERR_PTR(error);
/* Create an object to capture the defer ops. */
- dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&dfc->dfc_list);
INIT_LIST_HEAD(&dfc->dfc_dfops);
@@ -1011,7 +1006,7 @@ xfs_defer_ops_capture(
* transaction.
*/
for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
- ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL);
ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
}
@@ -1038,7 +1033,7 @@ xfs_defer_ops_capture_abort(
for (i = 0; i < dfc->dfc_held.dr_inos; i++)
xfs_irele(dfc->dfc_held.dr_ip[i]);
- kmem_free(dfc);
+ kfree(dfc);
}
/*
@@ -1114,7 +1109,7 @@ xfs_defer_ops_continue(
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
tp->t_flags |= dfc->dfc_tpflags;
- kmem_free(dfc);
+ kfree(dfc);
}
/* Release the resources captured and continued during recovery. */
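
xfs_defer_alloc() now takes the destination list instead of a transaction, so xfs_defer_add() (&tp->t_dfops) and xfs_defer_start_recovery() (r_dfops) share one allocator. A sketch with a minimal doubly linked list standing in for list_head:

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for the kernel's list_head plumbing. */
struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

struct pending { struct list_head list; const char *ops; };

/* The caller supplies the list, as in the reworked xfs_defer_alloc(). */
static struct pending *defer_alloc(struct list_head *dfops, const char *ops)
{
	struct pending *dfp = calloc(1, sizeof(*dfp));

	if (!dfp)
		return NULL;
	dfp->ops = ops;
	list_add_tail(&dfp->list, dfops);
	return dfp;
}

int main(void)
{
	struct list_head dfops;

	list_init(&dfops);
	defer_alloc(&dfops, "barrier");
	/* list is the first member, so the cast back is valid */
	printf("queued: %s\n", ((struct pending *)dfops.next)->ops);
	return 0;
}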
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index a76673281514..4821519efad4 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -18,6 +18,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_health.h"
const struct xfs_name xfs_name_dotdot = {
.name = (const unsigned char *)"..",
@@ -25,6 +26,12 @@ const struct xfs_name xfs_name_dotdot = {
.type = XFS_DIR3_FT_DIR,
};
+const struct xfs_name xfs_name_dot = {
+ .name = (const unsigned char *)".",
+ .len = 1,
+ .type = XFS_DIR3_FT_DIR,
+};
+
/*
* Convert inode mode to directory entry filetype
*/
@@ -104,13 +111,13 @@ xfs_da_mount(
ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
- mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_MAYFAIL);
- mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_MAYFAIL);
+ mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!mp->m_dir_geo || !mp->m_attr_geo) {
- kmem_free(mp->m_dir_geo);
- kmem_free(mp->m_attr_geo);
+ kfree(mp->m_dir_geo);
+ kfree(mp->m_attr_geo);
return -ENOMEM;
}
@@ -178,8 +185,8 @@ void
xfs_da_unmount(
struct xfs_mount *mp)
{
- kmem_free(mp->m_dir_geo);
- kmem_free(mp->m_attr_geo);
+ kfree(mp->m_dir_geo);
+ kfree(mp->m_attr_geo);
}
/*
@@ -236,7 +243,7 @@ xfs_dir_init(
if (error)
return error;
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -244,7 +251,7 @@ xfs_dir_init(
args->dp = dp;
args->trans = tp;
error = xfs_dir2_sf_create(args, pdp->i_ino);
- kmem_free(args);
+ kfree(args);
return error;
}
@@ -273,7 +280,7 @@ xfs_dir_createname(
XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -313,7 +320,7 @@ xfs_dir_createname(
rval = xfs_dir2_node_addname(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -333,7 +340,8 @@ xfs_dir_cilookup_result(
!(args->op_flags & XFS_DA_OP_CILOOKUP))
return -EEXIST;
- args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ args->value = kmalloc(len,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL);
if (!args->value)
return -ENOMEM;
@@ -364,15 +372,8 @@ xfs_dir_lookup(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
- /*
- * We need to use KM_NOFS here so that lockdep will not throw false
- * positive deadlock warnings on a non-transactional lookup path. It is
- * safe to recurse into inode recalim in that case, but lockdep can't
- * easily be taught about it. Hence KM_NOFS avoids having to add more
- * lockdep Doing this avoids having to add a bunch of lockdep class
- * annotations into the reclaim path for the ilock.
- */
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
@@ -419,7 +420,7 @@ out_check_rval:
}
out_free:
xfs_iunlock(dp, lock_mode);
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -441,7 +442,7 @@ xfs_dir_removename(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -477,7 +478,7 @@ xfs_dir_removename(
else
rval = xfs_dir2_node_removename(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -502,7 +503,7 @@ xfs_dir_replace(
if (rval)
return rval;
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -538,7 +539,7 @@ xfs_dir_replace(
else
rval = xfs_dir2_node_replace(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -626,8 +627,10 @@ xfs_dir2_isblock(
return 0;
*isblock = true;
- if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 19af22a16c41..8497d041f316 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
extern const struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dot;
+
+static inline bool
+xfs_dir2_samename(
+ const struct xfs_name *n1,
+ const struct xfs_name *n2)
+{
+ if (n1 == n2)
+ return true;
+ if (n1->len != n2->len)
+ return false;
+ return !memcmp(n1->name, n2->name, n1->len);
+}
/*
* Convert inode mode to directory entry filetype
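
xfs_dir2_samename() short-circuits on pointer identity before comparing lengths and bytes, so comparing xfs_name_dot against itself never reaches memcmp. A self-contained version of the same comparison:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct name { const unsigned char *name; int len; };

/* Mirrors xfs_dir2_samename(): identity, then length, then bytes. */
static bool samename(const struct name *n1, const struct name *n2)
{
	if (n1 == n2)
		return true;
	if (n1->len != n2->len)
		return false;
	return !memcmp(n1->name, n2->name, n1->len);
}

int main(void)
{
	struct name dot = { (const unsigned char *)".", 1 };
	struct name also_dot = { (const unsigned char *)".", 1 };

	printf("%d %d\n", samename(&dot, &dot), samename(&dot, &also_dot));
	return 0;	/* prints: 1 1 */
}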
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 3c256d4cc40b..a2da007adb46 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -20,6 +20,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_health.h"
/*
* Local function prototypes.
@@ -152,6 +153,7 @@ xfs_dir3_block_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -1108,7 +1110,7 @@ xfs_dir2_sf_to_block(
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(ifp->if_bytes, 0);
+ sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL);
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
@@ -1253,7 +1255,7 @@ xfs_dir2_sf_to_block(
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/* Done with the temporary buffer */
- kmem_free(sfp);
+ kfree(sfp);
/*
* Sort the leaf entries by hash value.
*/
@@ -1268,6 +1270,6 @@ xfs_dir2_sf_to_block(
xfs_dir3_data_check(dp, bp);
return 0;
out_free:
- kmem_free(sfp);
+ kfree(sfp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index dbcf58979a59..7a6d965bea71 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_health.h"
static xfs_failaddr_t xfs_dir2_data_freefind_verify(
struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
@@ -433,6 +434,7 @@ xfs_dir3_data_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -1198,6 +1200,7 @@ xfs_dir2_data_use_free(
corrupt:
xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
hdr, sizeof(*hdr), __FILE__, __LINE__, fa);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index cb9e950a911d..08dda5ce9d91 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -19,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_health.h"
/*
* Local function declarations.
@@ -1393,8 +1394,10 @@ xfs_dir2_leaf_removename(
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[db]) != oldbest) {
xfs_buf_mark_corrupt(lbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
+
/*
* Mark the former data entry unused.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 7a03aeb9f4c9..be0b8834028c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_health.h"
/*
* Function declarations.
@@ -231,6 +232,7 @@ __xfs_dir3_free_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -443,6 +445,7 @@ xfs_dir2_leaf_to_node(
if (be32_to_cpu(ltp->bestcount) >
(uint)dp->i_disk_size / args->geo->blksize) {
xfs_buf_mark_corrupt(lbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -517,6 +520,7 @@ xfs_dir2_leafn_add(
*/
if (index < 0) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -736,6 +740,7 @@ xfs_dir2_leafn_lookup_for_addname(
cpu_to_be16(NULLDATAOFF))) {
if (curfdb != newfdb)
xfs_trans_brelse(tp, curbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
curfdb = newfdb;
@@ -804,6 +809,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir3_leaf_check(dp, bp);
if (leafhdr.count <= 0) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1739,6 +1745,7 @@ xfs_dir2_node_add_datablk(
} else {
xfs_alert(mp, " ... fblk is NULL");
}
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e1f83fc7b6ad..17a20384c8b7 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -276,7 +276,7 @@ xfs_dir2_block_to_sf(
* format the data into. Once we have formatted the data, we can free
* the block and copy the formatted data into the inode literal area.
*/
- sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
+ sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
/*
@@ -350,7 +350,7 @@ xfs_dir2_block_to_sf(
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(sfp);
+ kfree(sfp);
return error;
}
@@ -524,7 +524,7 @@ xfs_dir2_sf_addname_hard(
* Copy the old directory to the stack buffer.
*/
old_isize = (int)dp->i_disk_size;
- buf = kmem_alloc(old_isize, 0);
+ buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
memcpy(oldsfp, dp->i_df.if_data, old_isize);
/*
@@ -576,7 +576,7 @@ xfs_dir2_sf_addname_hard(
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -1151,7 +1151,7 @@ xfs_dir2_sf_toino4(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, 0);
+ buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1190,7 +1190,7 @@ xfs_dir2_sf_toino4(
/*
* Clean up the inode.
*/
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
@@ -1223,7 +1223,7 @@ xfs_dir2_sf_toino8(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, 0);
+ buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1262,7 +1262,7 @@ xfs_dir2_sf_toino8(
/*
* Clean up the inode.
*/
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 382ab1e71c0b..2b2f9050fbfb 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -477,15 +477,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
/*
- * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
- * arrays below.
- */
-#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
-
-/*
- * The second word of agf_levels in the first a.g. overlaps the EFS
- * superblock's magic number. Since the magic numbers valid for EFS
- * are > 64k, our value cannot be confused for an EFS superblock's.
+ * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number.
+ * Since the magic numbers valid for EFS are > 64k, our value cannot be
+ * mistaken for an EFS superblock.
*/
typedef struct xfs_agf {
@@ -499,8 +493,13 @@ typedef struct xfs_agf {
/*
* Freespace and rmap information
*/
- __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
- __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+ __be32 agf_bno_root; /* bnobt root block */
+ __be32 agf_cnt_root; /* cntbt root block */
+ __be32 agf_rmap_root; /* rmapbt root block */
+
+ __be32 agf_bno_level; /* bnobt btree levels */
+ __be32 agf_cnt_level; /* cntbt btree levels */
+ __be32 agf_rmap_level; /* rmapbt btree levels */
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
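
The AGF rework replaces the agf_roots[]/agf_levels[] arrays, which were indexed by btree number, with one named big-endian field per btree. A sketch of the named-field layout, using htonl()/ntohl() in place of the kernel's endian helpers (values illustrative):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htonl/ntohl stand in for cpu_to_be32/be32_to_cpu */

/* Named per-btree fields, as in the reworked struct xfs_agf. */
struct agf {
	uint32_t bno_root, cnt_root, rmap_root;
	uint32_t bno_level, cnt_level, rmap_level;
};

int main(void)
{
	struct agf agf = {
		.bno_root = htonl(1), .cnt_root = htonl(2),
		.bno_level = htonl(1), .cnt_level = htonl(1),
	};

	/* Old code indexed agf_roots[XFS_BTNUM_BNOi]; names need no enum. */
	printf("bnobt root %u level %u\n",
	       (unsigned int)ntohl(agf.bno_root),
	       (unsigned int)ntohl(agf.bno_level));
	return 0;
}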
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6360073865db..ca1b17d01437 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -195,6 +195,8 @@ struct xfs_fsop_geom {
#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
+#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */
+#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -292,6 +294,7 @@ struct xfs_ag_geometry {
#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+#define XFS_AG_GEOM_SICK_INODES (1 << 10) /* bad inodes were seen */
/*
* Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
@@ -709,9 +712,12 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
+#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */
+#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
+#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 25
+#define XFS_SCRUB_TYPE_NR 28
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 6296993ff8f3..3c64b5f9bd68 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -26,21 +26,40 @@
* and the "sick" field tells us if that piece was found to need repairs.
* Therefore we can conclude that for a given sick flag value:
*
- * - checked && sick => metadata needs repair
- * - checked && !sick => metadata is ok
- * - !checked => has not been examined since mount
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked && sick => errors have been observed during normal operation,
+ * but the metadata has not been checked thoroughly
+ * - !checked && !sick => has not been examined since mount
+ *
+ * Evidence of health problems can be sorted into three basic categories:
+ *
+ * a) Primary evidence, which signals that something is defective within the
+ * general grouping of metadata.
+ *
+ * b) Secondary evidence, which are side effects of the primary problem but
+ *      are not themselves problems.  These can be forgotten when the primary
+ * health problems are addressed.
+ *
+ * c) Indirect evidence, which points to something being wrong in another
+ * group, but we had to release resources and this is all that's left of
+ * that state.
*/
struct xfs_mount;
struct xfs_perag;
struct xfs_inode;
struct xfs_fsop_geom;
+struct xfs_btree_cur;
+struct xfs_da_args;
/* Observable health issues for metadata spanning the entire filesystem. */
#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */
+#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */
/* Observable health issues for realtime volume metadata. */
#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
@@ -57,6 +76,7 @@ struct xfs_fsop_geom;
#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+#define XFS_SICK_AG_INODES (1 << 10) /* inactivated bad inodes */
/* Observable health issues for inode metadata. */
#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
@@ -73,11 +93,16 @@ struct xfs_fsop_geom;
#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */
#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */
+/* Don't propagate sick status to ag health summary during inactivation */
+#define XFS_SICK_INO_FORGET (1 << 12)
+
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
XFS_SICK_FS_UQUOTA | \
XFS_SICK_FS_GQUOTA | \
- XFS_SICK_FS_PQUOTA)
+ XFS_SICK_FS_PQUOTA | \
+ XFS_SICK_FS_QUOTACHECK | \
+ XFS_SICK_FS_NLINKS)
#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
XFS_SICK_RT_SUMMARY)
@@ -107,29 +132,86 @@ struct xfs_fsop_geom;
XFS_SICK_INO_DIR_ZAPPED | \
XFS_SICK_INO_SYMLINK_ZAPPED)
-/* These functions must be provided by the xfs implementation. */
+/* Secondary state related to (but not primary evidence of) health problems. */
+#define XFS_SICK_FS_SECONDARY (0)
+#define XFS_SICK_RT_SECONDARY (0)
+#define XFS_SICK_AG_SECONDARY (0)
+#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET)
+
+/* Evidence of health problems elsewhere. */
+#define XFS_SICK_FS_INDIRECT (0)
+#define XFS_SICK_RT_INDIRECT (0)
+#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES)
+#define XFS_SICK_INO_INDIRECT (0)
+
+/* All health masks. */
+#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
+ XFS_SICK_FS_SECONDARY | \
+ XFS_SICK_FS_INDIRECT)
+
+#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \
+ XFS_SICK_RT_SECONDARY | \
+ XFS_SICK_RT_INDIRECT)
+
+#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
+ XFS_SICK_AG_SECONDARY | \
+ XFS_SICK_AG_INDIRECT)
+
+#define XFS_SICK_INO_ALL (XFS_SICK_INO_PRIMARY | \
+ XFS_SICK_INO_SECONDARY | \
+ XFS_SICK_INO_INDIRECT | \
+ XFS_SICK_INO_ZAPPED)
+
+/*
+ * These functions must be provided by the xfs implementation. Function
+ * behavior with respect to the first argument should be as follows:
+ *
+ * xfs_*_mark_sick: Set the sick flags and do not set checked flags.
+ * Runtime code should call this upon encountering
+ * a corruption.
+ *
+ * xfs_*_mark_corrupt: Set the sick and checked flags simultaneously.
+ * Fsck tools should call this when corruption is
+ * found.
+ *
+ * xfs_*_mark_healthy: Clear the sick flags and set the checked flags.
+ * Fsck tools should call this after correcting errors.
+ *
+ * xfs_*_measure_sickness: Return the sick and checked status in the provided
+ * out parameters.
+ */
void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
+void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int mask);
void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask);
void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
unsigned int *checked);
void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask);
void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
unsigned int *checked);
void xfs_health_unmount(struct xfs_mount *mp);
+void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_btree_mark_sick(struct xfs_btree_cur *cur);
+void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_da_mark_sick(struct xfs_da_args *args);
/* Now some helpers. */
@@ -197,4 +279,7 @@ void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+#define xfs_metadata_is_sick(error) \
+ (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC))
+
#endif /* __XFS_HEALTH_H__ */
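The four-state table at the top of this header maps directly onto the two bitmasks returned by the measure_sickness functions. An illustrative helper (not kernel code) that decodes one flag:

static const char *
health_state(unsigned int sick, unsigned int checked, unsigned int mask)
{
	if (checked & mask)
		return (sick & mask) ? "needs repair" : "ok";
	return (sick & mask) ? "errors seen, not checked thoroughly"
			     : "unexamined since mount";
}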
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 2361a22035b0..e5ac3e5430c4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -27,6 +27,7 @@
#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
/*
* Lookup a record by ino in the btree given by cur.
@@ -140,13 +141,13 @@ xfs_inobt_complain_bad_rec(
struct xfs_mount *mp = cur->bc_mp;
xfs_warn(mp,
- "%s Inode BTree record corruption in AG %d detected at %pS!",
- cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
- cur->bc_ag.pag->pag_agno, fa);
+ "%sbt record corruption in AG %d detected at %pS!",
+ cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
irec->ir_free, irec->ir_holemask);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -205,14 +206,17 @@ xfs_inobt_insert(
struct xfs_buf *agbp,
xfs_agino_t newino,
xfs_agino_t newlen,
- xfs_btnum_t btnum)
+ bool is_finobt)
{
struct xfs_btree_cur *cur;
xfs_agino_t thisino;
int i;
int error;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+ if (is_finobt)
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
+ else
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
for (thisino = newino;
thisino < newino + newlen;
@@ -528,16 +532,14 @@ __xfs_inobt_rec_merge(
}
/*
- * Insert a new sparse inode chunk into the associated inode btree. The inode
- * record for the sparse chunk is pre-aligned to a startino that should match
- * any pre-existing sparse inode record in the tree. This allows sparse chunks
- * to fill over time.
+ * Insert a new sparse inode chunk into the associated inode allocation btree.
+ * The inode record for the sparse chunk is pre-aligned to a startino that
+ * should match any pre-existing sparse inode record in the tree. This allows
+ * sparse chunks to fill over time.
*
- * This function supports two modes of handling preexisting records depending on
- * the merge flag. If merge is true, the provided record is merged with the
+ * If no preexisting record exists, the provided record is inserted.
+ * If there is a preexisting record, the provided record is merged with the
* existing record and updated in place. The merged record is returned in nrec.
- * If merge is false, an existing record is replaced with the provided record.
- * If no preexisting record exists, the provided record is always inserted.
*
* It is considered corruption if a merge is requested and not possible. Given
* the sparse inode alignment constraints, this should never happen.
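For reference, the merge described above is performed by __xfs_inobt_rec_merge(); the field details below are recalled from the upstream helper and should be treated as assumptions. Counts combine by addition, and because 0 bits in both ir_holemask and ir_free denote allocated inodes, the allocated ranges combine by bitwise AND:

	/* sketch of the merge body */
	nrec->ir_count += srec->ir_count;
	nrec->ir_freecount += srec->ir_freecount;
	nrec->ir_holemask &= srec->ir_holemask;	/* 0 bits == allocated */
	nrec->ir_free &= srec->ir_free;		/* 0 bits == allocated */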
@@ -547,9 +549,7 @@ xfs_inobt_insert_sprec(
struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- int btnum,
- struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
- bool merge) /* merge or replace */
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
@@ -557,7 +557,7 @@ xfs_inobt_insert_sprec(
int i;
struct xfs_inobt_rec_incore rec;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -571,6 +571,7 @@ xfs_inobt_insert_sprec(
if (error)
goto error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -579,45 +580,45 @@ xfs_inobt_insert_sprec(
}
/*
- * A record exists at this startino. Merge or replace the record
- * depending on what we've been asked to do.
+ * A record exists at this startino. Merge the records.
*/
- if (merge) {
- error = xfs_inobt_get_rec(cur, &rec, &i);
- if (error)
- goto error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto error;
- }
- if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
- error = -EFSCORRUPTED;
- goto error;
- }
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
- /*
- * This should never fail. If we have coexisting records that
- * cannot merge, something is seriously wrong.
- */
- if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
- error = -EFSCORRUPTED;
- goto error;
- }
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
- trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
- rec.ir_holemask, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
- /* merge to nrec to output the updated record */
- __xfs_inobt_rec_merge(nrec, &rec);
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
+ nrec->ir_holemask);
- error = xfs_inobt_rec_check_count(mp, nrec);
- if (error)
- goto error;
- }
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
error = xfs_inobt_update(cur, nrec);
if (error)
@@ -632,6 +633,59 @@ error:
}
/*
+ * Insert a new sparse inode chunk into the free inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * The new record is always inserted, overwriting a pre-existing record if
+ * there is one.
+ */
+STATIC int
+xfs_finobt_insert_sprec(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_btree_cur *cur;
+ int error;
+ int i;
+
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ } else {
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp. Returns 0 if
* inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
* the caller knows it can try another AG, a hard -ENOSPC when over the maximum
@@ -857,8 +911,7 @@ sparse_alloc:
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
- error = xfs_inobt_insert_sprec(pag, tp, agbp,
- XFS_BTNUM_INO, &rec, true);
+ error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
@@ -882,21 +935,19 @@ sparse_alloc:
* existing record with this one.
*/
if (xfs_has_finobt(args.mp)) {
- error = xfs_inobt_insert_sprec(pag, tp, agbp,
- XFS_BTNUM_FINO, &rec, false);
+ error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
- error = xfs_inobt_insert(pag, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
+ error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
if (error)
return error;
if (xfs_has_finobt(args.mp)) {
error = xfs_inobt_insert(pag, tp, agbp, newino,
- newlen, XFS_BTNUM_FINO);
+ newlen, true);
if (error)
return error;
}
@@ -949,8 +1000,10 @@ xfs_ialloc_next_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
return 0;
@@ -974,8 +1027,10 @@ xfs_ialloc_get_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
return 0;
@@ -1030,7 +1085,7 @@ xfs_dialloc_ag_inobt(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1053,6 +1108,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1061,6 +1117,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, j != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1219,6 +1276,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1228,6 +1286,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1237,6 +1296,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1297,8 +1357,10 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(lcur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(lcur);
return -EFSCORRUPTED;
+ }
/*
* See if we've landed in the parent inode record. The finobt
@@ -1322,12 +1384,14 @@ xfs_dialloc_ag_finobt_near(
if (error)
goto error_rcur;
if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ xfs_btree_mark_sick(lcur);
error = -EFSCORRUPTED;
goto error_rcur;
}
}
if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ xfs_btree_mark_sick(lcur);
error = -EFSCORRUPTED;
goto error_rcur;
}
@@ -1383,8 +1447,10 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
}
@@ -1395,14 +1461,18 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -1424,14 +1494,18 @@ xfs_dialloc_ag_update_inobt(
error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
@@ -1440,8 +1514,10 @@ xfs_dialloc_ag_update_inobt(
if (XFS_IS_CORRUPT(cur->bc_mp,
rec.ir_free != frec->ir_free ||
- rec.ir_freecount != frec->ir_freecount))
+ rec.ir_freecount != frec->ir_freecount)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return xfs_inobt_update(cur, &rec);
}
@@ -1483,7 +1559,7 @@ xfs_dialloc_ag(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1526,7 +1602,7 @@ xfs_dialloc_ag(
* the original freecount. If all is well, make the equivalent update to
* the inobt using the finobt record and offset information.
*/
- icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ icur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(icur);
if (error)
@@ -1943,7 +2019,7 @@ xfs_difree_inobt(
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1958,6 +2034,7 @@ xfs_difree_inobt(
goto error0;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1968,6 +2045,7 @@ xfs_difree_inobt(
goto error0;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2068,7 +2146,7 @@ xfs_difree_finobt(
int error;
int i;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
@@ -2080,6 +2158,7 @@ xfs_difree_finobt(
* something is out of sync.
*/
if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2106,6 +2185,7 @@ xfs_difree_finobt(
if (error)
goto error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2116,6 +2196,7 @@ xfs_difree_finobt(
if (XFS_IS_CORRUPT(mp,
rec.ir_free != ibtrec->ir_free ||
rec.ir_freecount != ibtrec->ir_freecount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2265,7 +2346,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -2604,6 +2685,8 @@ xfs_read_agi(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
if (error)
return error;
if (tp)
@@ -2765,7 +2848,7 @@ xfs_ialloc_count_inodes(
struct xfs_ialloc_count_inodes ci = {0};
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
if (error)
return error;
@@ -2982,7 +3065,7 @@ xfs_ialloc_check_shrink(
if (!xfs_has_sparseinodes(pag->pag_mount))
return 0;
- cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agibp);
/* Look up the inobt record that would correspond to the new EOFS. */
agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
@@ -2995,6 +3078,7 @@ xfs_ialloc_check_shrink(
goto out;
if (!has) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
error = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 42a5e1f227a0..cc661fca6ff5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -17,6 +17,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
@@ -37,7 +38,15 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_btnum);
+ cur->bc_ag.agbp);
+}
+
+STATIC struct xfs_btree_cur *
+xfs_finobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp);
}
STATIC void
@@ -81,9 +90,9 @@ xfs_inobt_mod_blockcount(
if (!xfs_has_inobtcounts(cur->bc_mp))
return;
- if (cur->bc_btnum == XFS_BTNUM_FINO)
+ if (xfs_btree_is_fino(cur->bc_ops))
be32_add_cpu(&agi->agi_fblocks, howmuch);
- else if (cur->bc_btnum == XFS_BTNUM_INO)
+ else
be32_add_cpu(&agi->agi_iblocks, howmuch);
xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
}
@@ -300,7 +309,7 @@ xfs_inobt_verify(
* xfs_perag_initialised_agi(pag)) if we ever do.
*/
if (xfs_has_crc(mp)) {
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
}
@@ -310,7 +319,7 @@ xfs_inobt_verify(
if (level >= M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp,
+ return xfs_btree_agblock_verify(bp,
M_IGEO(mp)->inobt_mxr[level != 0]);
}
@@ -320,7 +329,7 @@ xfs_inobt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_inobt_verify(bp);
@@ -344,7 +353,7 @@ xfs_inobt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -398,9 +407,17 @@ xfs_inobt_keys_contiguous(
be32_to_cpu(key2->inobt.ir_startino));
}
-static const struct xfs_btree_ops xfs_inobt_ops = {
+const struct xfs_btree_ops xfs_inobt_ops = {
+ .name = "ino",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_INO_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2),
+ .sick_mask = XFS_SICK_AG_INOBT,
.dup_cursor = xfs_inobt_dup_cursor,
.set_root = xfs_inobt_set_root,
@@ -420,11 +437,19 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.keys_contiguous = xfs_inobt_keys_contiguous,
};
-static const struct xfs_btree_ops xfs_finobt_ops = {
+const struct xfs_btree_ops xfs_finobt_ops = {
+ .name = "fino",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
- .dup_cursor = xfs_inobt_dup_cursor,
+ .lru_refs = XFS_INO_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2),
+ .sick_mask = XFS_SICK_AG_FINOBT,
+
+ .dup_cursor = xfs_finobt_dup_cursor,
.set_root = xfs_finobt_set_root,
.alloc_block = xfs_finobt_alloc_block,
.free_block = xfs_finobt_free_block,
@@ -443,65 +468,54 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
};
/*
- * Initialize a new inode btree cursor.
+ * Create an inode btree cursor.
+ *
+ * For staging cursors, tp and agbp are NULL.
*/
-static struct xfs_btree_cur *
-xfs_inobt_init_common(
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
struct xfs_perag *pag,
- struct xfs_trans *tp, /* transaction pointer */
- xfs_btnum_t btnum) /* ialloc or free ino btree */
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
- cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- if (btnum == XFS_BTNUM_INO) {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
- cur->bc_ops = &xfs_inobt_ops;
- } else {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
- cur->bc_ops = &xfs_finobt_ops;
- }
-
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ }
return cur;
}
-/* Create an inode btree cursor. */
+/*
+ * Create a free inode btree cursor.
+ *
+ * For staging cursors, tp and agbp are NULL.
+ */
struct xfs_btree_cur *
-xfs_inobt_init_cursor(
+xfs_finobt_init_cursor(
struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_btnum_t btnum)
+ struct xfs_buf *agbp)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = agbp->b_addr;
- cur = xfs_inobt_init_common(pag, tp, btnum);
- if (btnum == XFS_BTNUM_INO)
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- else
- cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
+ cur->bc_ag.pag = xfs_perag_hold(pag);
cur->bc_ag.agbp = agbp;
- return cur;
-}
+ if (agbp) {
+ struct xfs_agi *agi = agbp->b_addr;
-/* Create an inode btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_inobt_stage_cursor(
- struct xfs_perag *pag,
- struct xbtree_afakeroot *afake,
- xfs_btnum_t btnum)
-{
- struct xfs_btree_cur *cur;
-
- cur = xfs_inobt_init_common(pag, NULL, btnum);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ }
return cur;
}
@@ -521,7 +535,7 @@ xfs_inobt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- if (cur->bc_btnum == XFS_BTNUM_INO) {
+ if (xfs_btree_is_ino(cur->bc_ops)) {
fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
agi->agi_root = cpu_to_be32(afake->af_root);
agi->agi_level = cpu_to_be32(afake->af_levels);
@@ -530,7 +544,7 @@ xfs_inobt_commit_staged_btree(
fields |= XFS_AGI_IBLOCKS;
}
xfs_ialloc_log_agi(tp, agbp, fields);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
} else {
fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
agi->agi_free_root = cpu_to_be32(afake->af_root);
@@ -540,7 +554,7 @@ xfs_inobt_commit_staged_btree(
fields |= XFS_AGI_IBLOCKS;
}
xfs_ialloc_log_agi(tp, agbp, fields);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
}
@@ -721,45 +735,21 @@ xfs_inobt_max_size(
XFS_INODES_PER_CHUNK);
}
-/* Read AGI and create inobt cursor. */
-int
-xfs_inobt_cur(
- struct xfs_perag *pag,
- struct xfs_trans *tp,
- xfs_btnum_t which,
- struct xfs_btree_cur **curpp,
- struct xfs_buf **agi_bpp)
-{
- struct xfs_btree_cur *cur;
- int error;
-
- ASSERT(*agi_bpp == NULL);
- ASSERT(*curpp == NULL);
-
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
- if (error)
- return error;
-
- cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which);
- *curpp = cur;
- return 0;
-}
-
static int
-xfs_inobt_count_blocks(
+xfs_finobt_count_blocks(
struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp = NULL;
- struct xfs_btree_cur *cur = NULL;
+ struct xfs_btree_cur *cur;
int error;
- error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error)
return error;
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
@@ -807,8 +797,7 @@ xfs_finobt_calc_reserves(
if (xfs_has_inobtcounts(pag->pag_mount))
error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
- error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO,
- &tree_len);
+ error = xfs_finobt_count_blocks(pag, tp, &tree_len);
if (error)
return error;
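Moving name, statoff, and sick_mask into the const ops structure lets generic btree code act without per-btnum switches. A hedged sketch of what xfs_btree_mark_sick() can now look like (XFS_BTREE_TYPE_MEM and the exact dispatch are assumptions):

void
xfs_btree_mark_sick(
	struct xfs_btree_cur	*cur)
{
	/* Assumed: no health state is kept for ephemeral in-memory btrees. */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return;

	ASSERT(cur->bc_ops->sick_mask);
	xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
}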
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 3262c3fe5ebe..6472ec1ecbb4 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -46,10 +46,10 @@ struct xfs_perag;
(maxrecs) * sizeof(xfs_inobt_key_t) + \
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
- struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
- struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -66,9 +66,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp,
xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
-int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp,
- xfs_btnum_t btnum, struct xfs_btree_cur **curpp,
- struct xfs_buf **agi_bpp);
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
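The xfs_btree_is_ino()/xfs_btree_is_fino() predicates that replace the bc_btnum comparisons are assumed to be simple pointer checks against the now-exported ops tables:

static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops)
{
	return ops == &xfs_inobt_ops;
}

static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops)
{
	return ops == &xfs_finobt_ops;
}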
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index f4e6b200cdf8..8796f2b3e534 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -394,11 +394,18 @@ xfs_iext_leaf_key(
return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
}
+static inline void *
+xfs_iext_alloc_node(
+ int size)
+{
+ return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+}
+
static void
xfs_iext_grow(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_node *node = xfs_iext_alloc_node(NODE_SIZE);
int i;
if (ifp->if_height == 1) {
@@ -454,7 +461,7 @@ xfs_iext_split_node(
int *nr_entries)
{
struct xfs_iext_node *node = *nodep;
- struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_node *new = xfs_iext_alloc_node(NODE_SIZE);
const int nr_move = KEYS_PER_NODE / 2;
int nr_keep = nr_move + (KEYS_PER_NODE & 1);
int i = 0;
@@ -542,7 +549,7 @@ xfs_iext_split_leaf(
int *nr_entries)
{
struct xfs_iext_leaf *leaf = cur->leaf;
- struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_leaf *new = xfs_iext_alloc_node(NODE_SIZE);
const int nr_move = RECS_PER_LEAF / 2;
int nr_keep = nr_move + (RECS_PER_LEAF & 1);
int i;
@@ -583,7 +590,7 @@ xfs_iext_alloc_root(
{
ASSERT(ifp->if_bytes == 0);
- ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec));
ifp->if_height = 1;
/* now that we have a node step into it */
@@ -603,7 +610,8 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL);
+ new = krealloc(ifp->if_data, new_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
ifp->if_data = new;
cur->leaf = new;
@@ -743,7 +751,7 @@ xfs_iext_remove_node(
again:
ASSERT(node->ptrs[pos]);
ASSERT(node->ptrs[pos] == victim);
- kmem_free(victim);
+ kfree(victim);
nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
offset = node->keys[0];
@@ -789,7 +797,7 @@ again:
ASSERT(node == ifp->if_data);
ifp->if_data = node->ptrs[0];
ifp->if_height--;
- kmem_free(node);
+ kfree(node);
}
}
@@ -863,7 +871,7 @@ xfs_iext_free_last_leaf(
struct xfs_ifork *ifp)
{
ifp->if_height--;
- kmem_free(ifp->if_data);
+ kfree(ifp->if_data);
ifp->if_data = NULL;
}
@@ -1044,7 +1052,7 @@ xfs_iext_destroy_node(
}
}
- kmem_free(node);
+ kfree(node);
}
void
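Dropping KM_NOFS here in favor of GFP_KERNEL | __GFP_NOLOCKDEP assumes the caller already sits inside a memalloc_nofs scope (transaction context), so reclaim cannot recurse into the filesystem; __GFP_NOLOCKDEP suppresses the lockdep false positives that GFP_NOFS used to paper over. A sketch of the assumed pattern:

	unsigned int	nofs_flags;

	/* Transaction allocation is assumed to enter a nofs scope... */
	nofs_flags = memalloc_nofs_save();

	/* ...so GFP_KERNEL allocations here behave as GFP_NOFS. */
	node = kzalloc(NODE_SIZE, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);

	memalloc_nofs_restore(nofs_flags);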
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 137a65bda95d..d0dcce462bf4 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_ialloc.h"
#include "xfs_dir2.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -132,9 +133,14 @@ xfs_imap_to_bp(
struct xfs_imap *imap,
struct xfs_buf **bpp)
{
- return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- imap->im_len, XBF_UNMAPPED, bpp,
- &xfs_inode_buf_ops);
+ int error;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
+ XFS_SICK_AG_INODES);
+ return error;
}
static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
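xfs_imap_to_bp() has only an AG number, not a perag reference, hence the xfs_agno_mark_sick() wrapper declared in xfs_health.h above. A hedged sketch of its likely shape:

void
xfs_agno_mark_sick(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	unsigned int		mask)
{
	struct xfs_perag	*pag = xfs_perag_get(mp, agno);

	/* Per-AG structure not set up yet? */
	if (!pag)
		return;

	xfs_ag_mark_sick(pag, mask);
	xfs_perag_put(pag);
}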
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index f4569e18a8d0..7d660a973909 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -25,6 +25,8 @@
#include "xfs_attr_leaf.h"
#include "xfs_types.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
struct kmem_cache *xfs_ifork_cache;
@@ -50,7 +52,8 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- char *new_data = kmem_alloc(mem_size, KM_NOFS);
+ char *new_data = kmalloc(mem_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
memcpy(new_data, data, size);
if (zero_terminate)
@@ -77,7 +80,7 @@ xfs_iformat_local(
/*
* If the size is unreasonable, then something
* is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
+ * kmalloc() or memcpy() below.
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
@@ -87,6 +90,7 @@ xfs_iformat_local(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_local", dip, sizeof(*dip),
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
@@ -116,7 +120,7 @@ xfs_iformat_extents(
/*
* If the number of extents is unreasonable, then something is wrong and
- * we just bail out rather than crash in kmem_alloc() or memcpy() below.
+ * we just bail out rather than crash in kmalloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
@@ -124,6 +128,7 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
@@ -143,6 +148,7 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(2)",
dp, sizeof(*dp), fa);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return xfs_bmap_complain_bad_rec(ip, whichfork,
fa, &new);
}
@@ -201,11 +207,13 @@ xfs_iformat_btree(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_btree", dfp, size,
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_NOFS);
+ ifp->if_broot = kmalloc(size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
ASSERT(ifp->if_broot != NULL);
/*
* Copy and convert from the on-disk structure
@@ -265,12 +273,14 @@ xfs_iformat_data_fork(
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
dip, sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
break;
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
}
@@ -342,6 +352,7 @@ xfs_iformat_attr_fork(
default:
xfs_inode_verifier_error(ip, error, __func__, dip,
sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
error = -EFSCORRUPTED;
break;
}
@@ -399,7 +410,8 @@ xfs_iroot_realloc(
*/
if (ifp->if_broot_bytes == 0) {
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
+ ifp->if_broot = kmalloc(new_size,
+ GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -414,7 +426,7 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = krealloc(ifp->if_broot, new_size,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -440,7 +452,7 @@ xfs_iroot_realloc(
else
new_size = 0;
if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_NOFS);
+ new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
/*
* First copy over the btree block header.
*/
@@ -470,7 +482,7 @@ xfs_iroot_realloc(
(int)new_size);
memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
}
- kmem_free(ifp->if_broot);
+ kfree(ifp->if_broot);
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
@@ -488,7 +500,7 @@ xfs_iroot_realloc(
*
* If the amount of space needed has decreased below the size of the
* inline buffer, then switch to using the inline buffer. Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * use krealloc() or kmalloc() to adjust the size of the buffer
* to what is needed.
*
* ip -- the inode whose if_data area is changing
@@ -509,7 +521,7 @@ xfs_idata_realloc(
if (byte_diff) {
ifp->if_data = krealloc(ifp->if_data, new_size,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
if (new_size == 0)
ifp->if_data = NULL;
ifp->if_bytes = new_size;
@@ -524,13 +536,13 @@ xfs_idestroy_fork(
struct xfs_ifork *ifp)
{
if (ifp->if_broot != NULL) {
- kmem_free(ifp->if_broot);
+ kfree(ifp->if_broot);
ifp->if_broot = NULL;
}
switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
- kmem_free(ifp->if_data);
+ kfree(ifp->if_data);
ifp->if_data = NULL;
break;
case XFS_DINODE_FMT_EXTENTS:
@@ -562,7 +574,7 @@ xfs_iextents_copy(
struct xfs_bmbt_irec rec;
int64_t copied = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
ASSERT(ifp->if_bytes > 0);
for_each_xfs_iext(ifp, &icur, &rec) {
@@ -689,7 +701,7 @@ xfs_ifork_init_cow(
return;
ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
@@ -802,3 +814,12 @@ xfs_iext_count_upgrade(
return 0;
}
+
+/* Decide whether a file mapping is on the realtime device. */
+bool
+xfs_ifork_is_realtime(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 96303249d28a..bd53eb951b65 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -260,6 +260,7 @@ int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
uint nr_to_add);
+bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 269573c82808..16872972e1e9 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -838,10 +838,12 @@ struct xfs_cud_log_format {
#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+#define XFS_BMAP_EXTENT_REALTIME (1U << 29)
#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
XFS_BMAP_EXTENT_ATTR_FORK | \
- XFS_BMAP_EXTENT_UNWRITTEN)
+ XFS_BMAP_EXTENT_UNWRITTEN | \
+ XFS_BMAP_EXTENT_REALTIME)
/*
 * This is the structure used to lay out a bui log item in the
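The new XFS_BMAP_EXTENT_REALTIME flag records in the log intent itself whether the mapping targets the realtime device. A hypothetical fragment of an intent-encoding path, using the xfs_ifork_is_realtime() helper added above (ip, whichfork, and map are assumed to be in scope):

	if (xfs_ifork_is_realtime(ip, whichfork))
		map->me_flags |= XFS_BMAP_EXTENT_REALTIME;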
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6709a7f8bad5..511c912d515c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -23,6 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -156,6 +157,7 @@ xfs_refcount_complain_bad_rec(
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -238,6 +240,7 @@ xfs_refcount_insert(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -268,12 +271,14 @@ xfs_refcount_delete(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -398,6 +403,7 @@ xfs_refcount_split_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -425,6 +431,7 @@ xfs_refcount_split_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -470,6 +477,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -478,6 +486,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -487,6 +496,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -498,6 +508,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -542,6 +553,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -550,6 +562,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -561,6 +574,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -608,6 +622,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -616,6 +631,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -627,6 +643,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -674,6 +691,7 @@ xfs_refcount_find_left_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -693,6 +711,7 @@ xfs_refcount_find_left_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -767,6 +786,7 @@ xfs_refcount_find_right_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -786,6 +806,7 @@ xfs_refcount_find_right_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1056,7 +1077,7 @@ xfs_refcount_still_have_space(
* to handle each of the shape changes to the refcount btree.
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
- cur->bc_ag.refc.shape_changes);
+ cur->bc_refc.shape_changes);
overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
@@ -1064,17 +1085,17 @@ xfs_refcount_still_have_space(
* Only allow 2 refcount extent updates per transaction if the
* refcount continue update "error" has been injected.
*/
- if (cur->bc_ag.refc.nr_ops > 2 &&
+ if (cur->bc_refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
- if (cur->bc_ag.refc.nr_ops == 0)
+ if (cur->bc_refc.nr_ops == 0)
return true;
else if (overhead > cur->bc_tp->t_log_res)
return false;
- return cur->bc_tp->t_log_res - overhead >
- cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -1134,7 +1155,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
- cur->bc_ag.refc.nr_ops++;
+ cur->bc_refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1142,6 +1163,7 @@ xfs_refcount_adjust_extents(
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp,
found_tmp != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1180,6 +1202,7 @@ xfs_refcount_adjust_extents(
*/
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1193,7 +1216,7 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
- cur->bc_ag.refc.nr_ops++;
+ cur->bc_refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
@@ -1203,6 +1226,7 @@ xfs_refcount_adjust_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1281,7 +1305,7 @@ xfs_refcount_adjust(
if (shape_changed)
shape_changes++;
if (shape_changes)
- cur->bc_ag.refc.shape_changes++;
+ cur->bc_refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
@@ -1327,8 +1351,10 @@ xfs_refcount_continue_op(
struct xfs_perag *pag = cur->bc_ag.pag;
if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
- ri->ri_blockcount)))
+ ri->ri_blockcount))) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
@@ -1374,8 +1400,8 @@ xfs_refcount_finish_one(
*/
rcur = *pcur;
if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
- nr_ops = rcur->bc_ag.refc.nr_ops;
- shape_changes = rcur->bc_ag.refc.shape_changes;
+ nr_ops = rcur->bc_refc.nr_ops;
+ shape_changes = rcur->bc_refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -1387,8 +1413,8 @@ xfs_refcount_finish_one(
return error;
rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
- rcur->bc_ag.refc.nr_ops = nr_ops;
- rcur->bc_ag.refc.shape_changes = shape_changes;
+ rcur->bc_refc.nr_ops = nr_ops;
+ rcur->bc_refc.shape_changes = shape_changes;
}
*pcur = rcur;
@@ -1449,7 +1475,7 @@ __xfs_refcount_add(
blockcount);
ri = kmem_cache_alloc(xfs_refcount_intent_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
@@ -1535,6 +1561,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1552,6 +1579,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1585,6 +1613,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1682,6 +1711,7 @@ xfs_refcount_adjust_cow_extents(
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1697,6 +1727,7 @@ xfs_refcount_adjust_cow_extents(
/* Adding a CoW reservation, there should be nothing here. */
if (XFS_IS_CORRUPT(cur->bc_mp,
agbno + aglen > ext.rc_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1714,6 +1745,7 @@ xfs_refcount_adjust_cow_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1721,14 +1753,17 @@ xfs_refcount_adjust_cow_extents(
case XFS_REFCOUNT_ADJUST_COW_FREE:
/* Removing a CoW reservation, there should be one extent. */
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1740,6 +1775,7 @@ xfs_refcount_adjust_cow_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1889,8 +1925,10 @@ xfs_refcount_recover_extent(
struct xfs_refcount_recovery *rr;
if (XFS_IS_CORRUPT(cur->bc_mp,
- be32_to_cpu(rec->refc.rc_refcount) != 1))
+ be32_to_cpu(rec->refc.rc_refcount) != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
rr = kmalloc(sizeof(struct xfs_refcount_recovery),
GFP_KERNEL | __GFP_NOFAIL);
@@ -1900,6 +1938,7 @@ xfs_refcount_recover_extent(
if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ xfs_btree_mark_sick(cur);
kfree(rr);
return -EFSCORRUPTED;
}
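The bc_ag.refc to bc_refc renames imply that the refcount bookkeeping moved out of the per-AG union into its own cursor field, so it no longer aliases other union members. A sketch of the assumed cursor layout (member details are illustrative):

struct xfs_btree_cur {
	/* ... */
	union {
		struct { /* ... */ } bc_ag;	/* AG btrees */
		struct { /* ... */ } bc_ino;	/* inode-rooted btrees */
		struct { /* ... */ } bc_mem;	/* in-memory btrees */
	};

	/* refcount bookkeeping, now outside the union */
	struct {
		unsigned int	nr_ops;		/* # record updates */
		unsigned int	shape_changes;	/* # of extent splits */
	} bc_refc;
};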
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 0d80bd99147c..ca59f6c89f3e 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -16,6 +16,7 @@
#include "xfs_refcount.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
@@ -77,8 +78,6 @@ xfs_refcountbt_alloc_block(
xfs_refc_block(args.mp)));
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
@@ -107,8 +106,6 @@ xfs_refcountbt_free_block(
struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
@@ -220,7 +217,7 @@ xfs_refcountbt_verify(
if (!xfs_has_reflink(mp))
return __this_address;
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
@@ -242,7 +239,7 @@ xfs_refcountbt_verify(
} else if (level >= mp->m_refc_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]);
}
STATIC void
@@ -251,7 +248,7 @@ xfs_refcountbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_refcountbt_verify(bp);
@@ -275,7 +272,7 @@ xfs_refcountbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -321,9 +318,17 @@ xfs_refcountbt_keys_contiguous(
be32_to_cpu(key2->refc.rc_startblock));
}
-static const struct xfs_btree_ops xfs_refcountbt_ops = {
+const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .name = "refcount",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(struct xfs_refcount_rec),
.key_len = sizeof(struct xfs_refcount_key),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_REFC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2),
+ .sick_mask = XFS_SICK_AG_REFCNTBT,
.dup_cursor = xfs_refcountbt_dup_cursor,
.set_root = xfs_refcountbt_set_root,
@@ -344,59 +349,32 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
};
/*
- * Initialize a new refcount btree cursor.
+ * Create a new refcount btree cursor.
+ *
+ * For staging cursors, tp and agbp are NULL.
*/
-static struct xfs_btree_cur *
-xfs_refcountbt_init_common(
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct xfs_buf *agbp,
struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops,
mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
-
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
- cur->bc_ag.refc.nr_ops = 0;
- cur->bc_ag.refc.shape_changes = 0;
- cur->bc_ops = &xfs_refcountbt_ops;
- return cur;
-}
-
-/* Create a btree cursor. */
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
-{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_btree_cur *cur;
-
- cur = xfs_refcountbt_init_common(mp, tp, pag);
- cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_refc.nr_ops = 0;
+ cur->bc_refc.shape_changes = 0;
cur->bc_ag.agbp = agbp;
- return cur;
-}
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
-/* Create a btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_refcountbt_stage_cursor(
- struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag)
-{
- struct xfs_btree_cur *cur;
-
- cur = xfs_refcountbt_init_common(mp, NULL, pag);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ }
return cur;
}
@@ -421,7 +399,7 @@ xfs_refcountbt_commit_staged_btree(
xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
XFS_AGF_REFCOUNT_ROOT |
XFS_AGF_REFCOUNT_LEVEL);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in a refcount btree block. */
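With xfs_refcountbt_stage_cursor() gone, staging presumably reuses the regular constructor with NULL tp and agbp (per the comment above) and then attaches the fake root:

	cur = xfs_refcountbt_init_cursor(mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(cur, afake);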
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index d66b37259bed..1e0ab25f6c68 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -48,8 +48,6 @@ struct xbtree_afakeroot;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 76bf7f48cb5a..ef16f6f9cef6 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -23,6 +23,7 @@
#include "xfs_error.h"
#include "xfs_inode.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_rmap_intent_cache;
@@ -56,8 +57,10 @@ xfs_rmap_lookup_le(
error = xfs_rmap_get_rec(cur, irec, &get_stat);
if (error)
return error;
- if (!get_stat)
+ if (!get_stat) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -132,6 +135,7 @@ xfs_rmap_insert(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -145,6 +149,7 @@ xfs_rmap_insert(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -174,6 +179,7 @@ xfs_rmap_delete(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -182,6 +188,7 @@ xfs_rmap_delete(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -208,10 +215,10 @@ xfs_rmap_btrec_to_irec(
/* Simple checks for rmap records. */
xfs_failaddr_t
xfs_rmap_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_rmap_irec *irec)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_mount *mp = pag->pag_mount;
bool is_inode;
bool is_unwritten;
bool is_bmbt;
@@ -226,8 +233,8 @@ xfs_rmap_check_irec(
return __this_address;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock,
- irec->rm_blockcount))
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
return __this_address;
}
@@ -262,6 +269,16 @@ xfs_rmap_check_irec(
return NULL;
}
+static inline xfs_failaddr_t
+xfs_rmap_check_btrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec)
+{
+ if (xfs_btree_is_mem_rmap(cur->bc_ops))
+ return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
+ return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
+}
+
static inline int
xfs_rmap_complain_bad_rec(
struct xfs_btree_cur *cur,
@@ -270,13 +287,18 @@ xfs_rmap_complain_bad_rec(
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_warn(mp,
- "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ if (xfs_btree_is_mem_rmap(cur->bc_ops))
+ xfs_warn(mp,
+ "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+ else
+ xfs_warn(mp,
+ "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
+ cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
irec->rm_blockcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -299,7 +321,7 @@ xfs_rmap_get_rec(
fa = xfs_rmap_btrec_to_irec(rec, irec);
if (!fa)
- fa = xfs_rmap_check_irec(cur, irec);
+ fa = xfs_rmap_check_btrec(cur, irec);
if (fa)
return xfs_rmap_complain_bad_rec(cur, fa, irec);
@@ -512,7 +534,7 @@ xfs_rmap_lookup_le_range(
*/
static int
xfs_rmap_free_check_owner(
- struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
uint64_t ltoff,
struct xfs_rmap_irec *rec,
xfs_filblks_t len,
@@ -520,6 +542,7 @@ xfs_rmap_free_check_owner(
uint64_t offset,
unsigned int flags)
{
+ struct xfs_mount *mp = cur->bc_mp;
int error = 0;
if (owner == XFS_RMAP_OWN_UNKNOWN)
@@ -529,12 +552,14 @@ xfs_rmap_free_check_owner(
if (XFS_IS_CORRUPT(mp,
(flags & XFS_RMAP_UNWRITTEN) !=
(rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
/* Make sure the owner matches what we expect to find in the tree. */
if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -546,16 +571,19 @@ xfs_rmap_free_check_owner(
if (flags & XFS_RMAP_BMBT_BLOCK) {
if (XFS_IS_CORRUPT(mp,
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
} else {
if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
if (XFS_IS_CORRUPT(mp,
offset + len > ltoff + rec->rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -618,6 +646,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -639,6 +668,7 @@ xfs_rmap_unmap(
if (XFS_IS_CORRUPT(mp,
bno <
ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -665,6 +695,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -677,12 +708,13 @@ xfs_rmap_unmap(
ltrec.rm_startblock > bno ||
ltrec.rm_startblock + ltrec.rm_blockcount <
bno + len)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Check owner information. */
- error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
+ error = xfs_rmap_free_check_owner(cur, ltoff, &ltrec, len, owner,
offset, flags);
if (error)
goto out_error;
@@ -697,6 +729,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -788,6 +821,86 @@ out_error:
return error;
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of rmapbt live updates. If
+ * the compiler supports jump labels, the static branch will be replaced by a
+ * nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch);
+
+void
+xfs_rmap_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_rmap_hooks_switch);
+}
+
+void
+xfs_rmap_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_rmap_hooks_switch);
+}
+
+/* Call downstream hooks for a reverse mapping update. */
+static inline void
+xfs_rmap_update_hook(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ enum xfs_rmap_intent_type op,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) {
+ struct xfs_rmap_update_params p = {
+ .startblock = startblock,
+ .blockcount = blockcount,
+ .unwritten = unwritten,
+ .oinfo = *oinfo, /* struct copy */
+ };
+
+ if (pag)
+ xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+ }
+}
+
+/* Call the specified function during a reverse mapping update. */
+int
+xfs_rmap_hook_add(
+ struct xfs_perag *pag,
+ struct xfs_rmap_hook *hook)
+{
+ return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Stop calling the specified function during a reverse mapping update. */
+void
+xfs_rmap_hook_del(
+ struct xfs_perag *pag,
+ struct xfs_rmap_hook *hook)
+{
+ xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Configure rmap update hook functions. */
+void
+xfs_rmap_hook_setup(
+ struct xfs_rmap_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->rmap_hook, mod_fn);
+}
+#else
+# define xfs_rmap_update_hook(t, p, o, s, b, u, oi) do { } while (0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
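Taken together, the pieces above give a hook user a setup/add/enable lifecycle. A hedged sketch of a hypothetical consumer follows; my_rmap_fn and the surrounding wiring are invented for illustration, and only the xfs_rmap_hook_* calls come from this patch:

	struct xfs_rmap_hook hook;
	int error;

	xfs_rmap_hook_setup(&hook, my_rmap_fn);	/* notifier callback */
	error = xfs_rmap_hook_add(pag, &hook);	/* watch this AG */
	if (!error)
		xfs_rmap_hook_enable();		/* flip the static branch on */

	/* ... rmap updates in this AG now reach my_rmap_fn ... */

	xfs_rmap_hook_disable();
	xfs_rmap_hook_del(pag, &hook);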
/*
* Remove a reference to an extent in the rmap btree.
*/
@@ -808,7 +921,7 @@ xfs_rmap_free(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
-
+ xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -900,6 +1013,7 @@ xfs_rmap_map(
if (XFS_IS_CORRUPT(mp,
have_lt != 0 &&
ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -917,10 +1031,12 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -974,6 +1090,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1021,6 +1138,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1055,6 +1173,7 @@ xfs_rmap_alloc(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1116,6 +1235,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1153,12 +1273,14 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp,
LEFT.rm_startblock + LEFT.rm_blockcount >
bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1181,6 +1303,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1193,10 +1316,12 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1227,6 +1352,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1246,6 +1372,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1257,6 +1384,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1264,6 +1392,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1275,6 +1404,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1282,6 +1412,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1305,6 +1436,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1312,6 +1444,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1331,6 +1464,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1342,6 +1476,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1349,6 +1484,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1419,6 +1555,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1461,6 +1598,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1476,6 +1614,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1509,6 +1648,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1522,6 +1662,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1534,6 +1675,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1606,6 +1748,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1634,6 +1777,7 @@ xfs_rmap_convert_shared(
if (XFS_IS_CORRUPT(mp,
LEFT.rm_startblock + LEFT.rm_blockcount >
bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1652,10 +1796,12 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1706,6 +1852,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1732,6 +1879,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1758,6 +1906,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1781,6 +1930,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1816,6 +1966,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1861,6 +2012,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1896,6 +2048,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1934,6 +2087,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2023,6 +2177,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2033,12 +2188,14 @@ xfs_rmap_unmap_shared(
ltrec.rm_startblock > bno ||
ltrec.rm_startblock + ltrec.rm_blockcount <
bno + len)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Make sure the owner matches what we expect to find in the tree. */
if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2047,16 +2204,19 @@ xfs_rmap_unmap_shared(
if (XFS_IS_CORRUPT(mp,
(flags & XFS_RMAP_UNWRITTEN) !=
(ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Check the offset. */
if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2113,6 +2273,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2142,6 +2303,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2221,6 +2383,7 @@ xfs_rmap_map_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2273,6 +2436,7 @@ xfs_rmap_map_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2335,15 +2499,12 @@ xfs_rmap_map_raw(
{
struct xfs_owner_info oinfo;
- oinfo.oi_owner = rmap->rm_owner;
- oinfo.oi_offset = rmap->rm_offset;
- oinfo.oi_flags = 0;
- if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+ xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset,
+ rmap->rm_flags);
- if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN)) ||
+ XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
return xfs_rmap_map(cur, rmap->rm_startblock,
rmap->rm_blockcount,
rmap->rm_flags & XFS_RMAP_UNWRITTEN,
@@ -2373,7 +2534,7 @@ xfs_rmap_query_range_helper(
fa = xfs_rmap_btrec_to_irec(rec, &irec);
if (!fa)
- fa = xfs_rmap_check_irec(cur, &irec);
+ fa = xfs_rmap_check_btrec(cur, &irec);
if (fa)
return xfs_rmap_complain_bad_rec(cur, fa, &irec);
@@ -2428,6 +2589,38 @@ xfs_rmap_finish_one_cleanup(
xfs_trans_brelse(tp, agbp);
}
+/* Commit an rmap operation into the ondisk tree. */
+int
+__xfs_rmap_finish_intent(
+ struct xfs_btree_cur *rcur,
+ enum xfs_rmap_intent_type op,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool unwritten)
+{
+ switch (op) {
+ case XFS_RMAP_ALLOC:
+ case XFS_RMAP_MAP:
+ return xfs_rmap_map(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_MAP_SHARED:
+ return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_FREE:
+ case XFS_RMAP_UNMAP:
+ return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_UNMAP_SHARED:
+ return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_CONVERT:
+ return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo);
+ case XFS_RMAP_CONVERT_SHARED:
+ return xfs_rmap_convert_shared(rcur, bno, len, !unwritten,
+ oinfo);
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+}
+
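Factoring the switch out of xfs_rmap_finish_one() lets other code commit a single logged rmap change against any rmapbt cursor, including the in-memory cursors added later in this series. A sketch of a hypothetical direct caller, where mcur, bno, len, and oinfo are assumed to exist:

	/* Replay one mapping into whichever btree sits behind mcur. */
	error = __xfs_rmap_finish_intent(mcur, XFS_RMAP_MAP, bno, len,
			&oinfo, false);
	if (error)
		return error;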
/*
* Process one of the deferred rmap operations. We pass back the
* btree cursor to maintain our lock on the rmapbt between calls.
@@ -2476,10 +2669,14 @@ xfs_rmap_finish_one(
* allocate blocks.
*/
error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
- if (error)
+ if (error) {
+ xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
return error;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
+ }
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
return -EFSCORRUPTED;
+ }
rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
}
@@ -2490,39 +2687,14 @@ xfs_rmap_finish_one(
unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
- switch (ri->ri_type) {
- case XFS_RMAP_ALLOC:
- case XFS_RMAP_MAP:
- error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount,
- unwritten, &oinfo);
- break;
- case XFS_RMAP_MAP_SHARED:
- error = xfs_rmap_map_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, unwritten, &oinfo);
- break;
- case XFS_RMAP_FREE:
- case XFS_RMAP_UNMAP:
- error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount,
- unwritten, &oinfo);
- break;
- case XFS_RMAP_UNMAP_SHARED:
- error = xfs_rmap_unmap_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, unwritten, &oinfo);
- break;
- case XFS_RMAP_CONVERT:
- error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount,
- !unwritten, &oinfo);
- break;
- case XFS_RMAP_CONVERT_SHARED:
- error = xfs_rmap_convert_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, !unwritten, &oinfo);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- }
+ error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+ ri->ri_bmap.br_blockcount, &oinfo, unwritten);
+ if (error)
+ return error;
- return error;
+ xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno,
+ ri->ri_bmap.br_blockcount, unwritten, &oinfo);
+ return 0;
}
/*
@@ -2559,7 +2731,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 3c98d9d50afb..9d01fe689497 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -186,6 +186,10 @@ void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur);
+int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
+ enum xfs_rmap_intent_type op, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool unwritten);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
@@ -195,7 +199,7 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a,
union xfs_btree_rec;
xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
-xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
const struct xfs_rmap_irec *irec);
int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
@@ -235,4 +239,29 @@ extern struct kmem_cache *xfs_rmap_intent_cache;
int __init xfs_rmap_intent_init_cache(void);
void xfs_rmap_intent_destroy_cache(void);
+/*
+ * Parameters for tracking reverse mapping changes. The hook function's
+ * action argument is enum xfs_rmap_intent_type; the rest of the update
+ * data is carried in this structure.
+ */
+struct xfs_rmap_update_params {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ struct xfs_owner_info oinfo;
+ bool unwritten;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+
+struct xfs_rmap_hook {
+ struct xfs_hook rmap_hook;
+};
+
+void xfs_rmap_hook_disable(void);
+void xfs_rmap_hook_enable(void);
+
+int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn);
+#endif
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 6c81b20e97d2..9e759efa81cc 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -16,11 +16,14 @@
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
static struct kmem_cache *xfs_rmapbt_cur_cache;
@@ -65,13 +68,12 @@ xfs_rmapbt_set_root(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- int btnum = cur->bc_btnum;
ASSERT(ptr->s != 0);
- agf->agf_roots[btnum] = ptr->s;
- be32_add_cpu(&agf->agf_levels[btnum], inc);
- cur->bc_ag.pag->pagf_levels[btnum] += inc;
+ agf->agf_rmap_root = ptr->s;
+ be32_add_cpu(&agf->agf_rmap_level, inc);
+ cur->bc_ag.pag->pagf_rmap_level += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -94,8 +96,6 @@ xfs_rmapbt_alloc_block(
&bno, 1);
if (error)
return error;
-
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
@@ -125,8 +125,6 @@ xfs_rmapbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
- bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
@@ -226,7 +224,7 @@ xfs_rmapbt_init_ptr_from_cur(
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
- ptr->s = agf->agf_roots[cur->bc_btnum];
+ ptr->s = agf->agf_rmap_root;
}
/*
@@ -340,18 +338,29 @@ xfs_rmapbt_verify(
if (!xfs_has_rmapbt(mp))
return __this_address;
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ unsigned int maxlevel = pag->pagf_rmap_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the free space btrees, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_rmap_level);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_rmap_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}
static void
@@ -360,7 +369,7 @@ xfs_rmapbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_rmapbt_verify(bp);
@@ -384,7 +393,7 @@ xfs_rmapbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -476,9 +485,19 @@ xfs_rmapbt_keys_contiguous(
be32_to_cpu(key2->rmap.rm_startblock));
}
-static const struct xfs_btree_ops xfs_rmapbt_ops = {
+const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .name = "rmap",
+ .type = XFS_BTREE_TYPE_AG,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
.rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
.key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2),
+ .sick_mask = XFS_SICK_AG_RMAPBT,
.dup_cursor = xfs_rmapbt_dup_cursor,
.set_root = xfs_rmapbt_set_root,
@@ -498,55 +517,176 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.keys_contiguous = xfs_rmapbt_keys_contiguous,
};
-static struct xfs_btree_cur *
-xfs_rmapbt_init_common(
+/*
+ * Create a new reverse mapping btree cursor.
+ *
+ * For staging cursors, tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct xfs_buf *agbp,
struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- /* Overlapping btree; 2 keys per pointer. */
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
- cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
- cur->bc_ops = &xfs_rmapbt_ops;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
+ }
return cur;
}
-/* Create a new reverse mapping btree cursor. */
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline unsigned int
+xfs_rmapbt_mem_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
+}
+
+/*
+ * Validate an in-memory rmap btree block. Callers are allowed to generate an
+ * in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rmapbt_mem_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= xfs_rmapbt_maxlevels_ondisk())
+ return __this_address;
+
+ maxrecs = xfs_rmapbt_mem_block_maxrecs(
+ XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+xfs_rmapbt_mem_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* Skip CRC checks on in-memory btrees to save time. */
+static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
+ .name = "xfs_rmapbt_mem",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
+ .verify_read = xfs_rmapbt_mem_rw_verify,
+ .verify_write = xfs_rmapbt_mem_rw_verify,
+ .verify_struct = xfs_rmapbt_mem_verify,
+};
+
+const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
+ .name = "mem_rmap",
+ .type = XFS_BTREE_TYPE_MEM,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_mem_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+ .keys_contiguous = xfs_rmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
- struct xfs_mount *mp,
+xfs_rmapbt_mem_cursor(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
+ struct xfbtree *xfbt)
{
- struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = pag->pag_mount;
- cur = xfs_rmapbt_init_common(mp, tp, pag);
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- cur->bc_ag.agbp = agbp;
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
+ xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
+ cur->bc_mem.xfbtree = xfbt;
+ cur->bc_nlevels = xfbt->nlevels;
+
+ cur->bc_mem.pag = xfs_perag_hold(pag);
return cur;
}
-/* Create a new reverse mapping btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_rmapbt_stage_cursor(
+/* Create an in-memory rmap btree. */
+int
+xfs_rmapbt_mem_init(
struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag)
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ xfs_agnumber_t agno)
{
- struct xfs_btree_cur *cur;
+ xfbt->owner = agno;
+ return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
+}
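Putting the two helpers together, building and tearing down an in-memory rmap btree looks roughly like this. This is a sketch under assumptions: btp is an xfile-backed buftarg obtained elsewhere, and error unwinding is abbreviated.

	struct xfbtree xfbt;
	struct xfs_btree_cur *mcur;
	int error;

	error = xfs_rmapbt_mem_init(mp, &xfbt, btp, pag->pag_agno);
	if (error)
		return error;

	mcur = xfs_rmapbt_mem_cursor(pag, tp, &xfbt);
	/* insert and query records through the usual xfs_btree_* calls */
	xfs_btree_del_cursor(mcur, error);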
- cur = xfs_rmapbt_init_common(mp, NULL, pag);
- xfs_btree_stage_afakeroot(cur, afake);
- return cur;
+/* Compute the max possible height for reverse mapping btrees in memory. */
+static unsigned int
+xfs_rmapbt_mem_maxlevels(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * How tall can an in-memory rmap btree become if we filled the entire
+ * AG with rmap records?
+ */
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
+#else
+# define xfs_rmapbt_mem_maxlevels() (0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
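To sanity-check xfs_rmapbt_mem_maxlevels(), the standalone userspace program below redoes the computation with assumed constants: a 4096-byte XFBNO_BLOCKSIZE, a 72-byte long-format CRC block header, 24-byte rmap records, 20-byte rmap keys, 8-byte pointers, and one record per 24 bytes of a 1 TiB AG. None of these sizes are spelled out in this patch, so treat the result (a height of about 7) as an estimate.

#include <stdio.h>

int main(void)
{
	unsigned long long blocklen = 4096 - 72;	/* block minus header */
	unsigned long long leaf_max = blocklen / 24;		/* 167 records */
	unsigned long long node_max = blocklen / (2 * 20 + 8);	/* 83 keyptrs */
	unsigned long long minrecs0 = leaf_max / 2;		/* 83 */
	unsigned long long minrecs1 = node_max / 2;		/* 41 */
	unsigned long long records = (1ULL << 40) / 24;	/* maximal AG */
	unsigned long long blocks = (records + minrecs0 - 1) / minrecs0;
	unsigned int height = 1;

	while (blocks > 1) {
		blocks = (blocks + minrecs1 - 1) / minrecs1;
		height++;
	}
	printf("in-memory rmapbt max height: %u\n", height);
	return 0;
}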
/*
* Install a new reverse mapping btree root. Caller is responsible for
@@ -563,12 +703,12 @@ xfs_rmapbt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
- agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_root = cpu_to_be32(afake->af_root);
+ agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
XFS_AGF_RMAP_BLOCKS);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in a reverse mapping btree block. */
@@ -618,7 +758,8 @@ xfs_rmapbt_maxlevels_ondisk(void)
* like if it consumes almost all the blocks in the AG due to maximal
* sharing factor.
*/
- return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+ return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
+ xfs_rmapbt_mem_maxlevels());
}
/* Compute the maximum height of an rmap btree. */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 3244715dd111..eb90d89e8086 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -10,6 +10,7 @@ struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
struct xbtree_afakeroot;
+struct xfbtree;
/* rmaps only exist on crc enabled filesystems */
#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
@@ -44,8 +45,6 @@ struct xbtree_afakeroot;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag);
void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
@@ -64,4 +63,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void);
int __init xfs_rmapbt_init_cur_cache(void);
void xfs_rmapbt_destroy_cur_cache(void);
+struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp, xfs_agnumber_t agno);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index e31663cb7b43..f246d6dbf4ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -17,6 +17,7 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_rtbitmap.h"
+#include "xfs_health.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -115,13 +116,19 @@ xfs_rtbuf_get(
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) {
+ xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+ XFS_SICK_RT_BITMAP);
return -EFSCORRUPTED;
+ }
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+ XFS_SICK_RT_BITMAP);
if (error)
return error;
@@ -934,7 +941,7 @@ xfs_rtfree_extent(
struct timespec64 atime;
ASSERT(mp->m_rbmip->i_itemp != NULL);
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5bb6e2bd6dee..d991eec05436 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1290,6 +1290,8 @@ xfs_sb_read_secondary(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_SSB_REF);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 4220d3584c1b..dfd61fa8332e 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -43,6 +43,60 @@ extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+/* btree ops */
+extern const struct xfs_btree_ops xfs_bnobt_ops;
+extern const struct xfs_btree_ops xfs_cntbt_ops;
+extern const struct xfs_btree_ops xfs_inobt_ops;
+extern const struct xfs_btree_ops xfs_finobt_ops;
+extern const struct xfs_btree_ops xfs_bmbt_ops;
+extern const struct xfs_btree_ops xfs_refcountbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
+
+static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_bnobt_ops;
+}
+
+static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_cntbt_ops;
+}
+
+static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_bmbt_ops;
+}
+
+static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_inobt_ops;
+}
+
+static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_finobt_ops;
+}
+
+static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_refcountbt_ops;
+}
+
+static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rmapbt_ops;
+}
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rmapbt_mem_ops;
+}
+#else
+# define xfs_btree_is_mem_rmap(...) (false)
+#endif
+
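With xfs_btnum_t removed (see the xfs_types.h hunk below), code that used to switch on cur->bc_btnum now identifies a btree by comparing ops pointers, as xfs_rmap_check_btrec() does earlier in this series. A minimal sketch of the same dispatch in a hypothetical helper:

static inline struct xfs_perag *
xfs_rmap_cur_pag(const struct xfs_btree_cur *cur)
{
	/* In-memory cursors keep their perag in bc_mem; AG cursors use bc_ag. */
	if (xfs_btree_is_mem_rmap(cur->bc_ops))
		return cur->bc_mem.pag;
	return cur->bc_ag.pag;
}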
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -128,19 +182,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
-
-/*
- * Symlink decoding/encoding functions
- */
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_inode *ip, struct xfs_ifork *ifp);
-xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
-
/* Computed inode geometry for the filesystem. */
struct xfs_ino_geometry {
/* Maximum inode count in this filesystem. */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 160aa20aa441..ffb1317a9212 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -16,7 +16,10 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
-
+#include "xfs_symlink_remote.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_health.h"
/*
* Each contiguous block has a header, so it is not just a simple pathlen
@@ -227,3 +230,153 @@ xfs_symlink_shortform_verify(
return __this_address;
return NULL;
}
+
+/* Read a remote symlink target into the buffer. */
+int
+xfs_symlink_remote_read(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ char *cur_chunk;
+ int pathlen = ip->i_disk_size;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int byte_cnt;
+ int n;
+ int error = 0;
+ int fsblocks = 0;
+ int offset;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ fsblocks = xfs_symlink_blocks(mp, pathlen);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ goto out;
+
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+ error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &bp, &xfs_symlink_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ if (error)
+ return error;
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ if (pathlen < byte_cnt)
+ byte_cnt = pathlen;
+
+ cur_chunk = bp->b_addr;
+ if (xfs_has_crc(mp)) {
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+ byte_cnt, bp)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ error = -EFSCORRUPTED;
+ xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+ offset, byte_cnt, ip->i_ino);
+ xfs_buf_relse(bp);
+ goto out;
+
+ }
+
+ cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+ }
+
+ memcpy(link + offset, cur_chunk, byte_cnt);
+
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_buf_relse(bp);
+ }
+ ASSERT(pathlen == 0);
+
+ link[ip->i_disk_size] = '\0';
+ error = 0;
+
+ out:
+ return error;
+}
+
+/* Write the symlink target into the inode. */
+int
+xfs_symlink_write_target(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ const char *target_path,
+ int pathlen,
+ xfs_fsblock_t fs_blocks,
+ uint resblks)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_mount *mp = tp->t_mountp;
+ const char *cur_chunk;
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ int byte_cnt;
+ int nmaps;
+ int offset = 0;
+ int n;
+ int error;
+
+ /*
+ * If the symlink will fit into the inode, write it inline.
+ */
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+
+ ip->i_disk_size = pathlen;
+ ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+ return 0;
+ }
+
+ nmaps = XFS_SYMLINK_MAPS;
+ error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA,
+ resblks, mval, &nmaps);
+ if (error)
+ return error;
+
+ ip->i_disk_size = pathlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ cur_chunk = target_path;
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ char *buf;
+
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ byte_cnt = min(byte_cnt, pathlen);
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt,
+ bp);
+
+ memcpy(buf, cur_chunk, byte_cnt);
+
+ cur_chunk += byte_cnt;
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+ (char *)bp->b_addr);
+ }
+ ASSERT(pathlen == 0);
+ return 0;
+}
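As a worked example of the chunk math the two functions share: on a v5 filesystem with 4096-byte blocks, and assuming a 56-byte struct xfs_dsymlink_hdr (a size not stated in this patch), XFS_SYMLINK_BUF_SPACE() leaves 4096 - 56 = 4040 bytes of target text per block. Any target up to MAXPATHLEN (1024 bytes) therefore fits in a single remote block, so the copy loops above normally run one iteration with byte_cnt clamped down to pathlen.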
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h
new file mode 100644
index 000000000000..a63bd38ae4fa
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SYMLINK_REMOTE_H
+#define __XFS_SYMLINK_REMOTE_H
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
+int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
+int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
+ const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
+ uint resblks);
+
+#endif /* __XFS_SYMLINK_REMOTE_H */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 70e97ea6eee7..69fc5b981352 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -31,7 +31,7 @@ xfs_trans_ijoin(
{
struct xfs_inode_log_item *iip;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
@@ -60,7 +60,7 @@ xfs_trans_ichgtime(
struct timespec64 tv;
ASSERT(tp);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
tv = current_time(inode);
@@ -90,7 +90,7 @@ xfs_trans_log_inode(
struct inode *inode = VFS_I(ip);
ASSERT(iip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
tp->t_flags |= XFS_TRANS_DIRTY;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 62e02d5380ad..76eb9e328835 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -80,11 +80,13 @@ typedef void * xfs_failaddr_t;
/*
* Inode fork identifiers.
*/
-#define XFS_DATA_FORK 0
-#define XFS_ATTR_FORK 1
-#define XFS_COW_FORK 2
+#define XFS_STAGING_FORK (-1) /* fake fork for staging a btree */
+#define XFS_DATA_FORK (0)
+#define XFS_ATTR_FORK (1)
+#define XFS_COW_FORK (2)
#define XFS_WHICHFORK_STRINGS \
+ { XFS_STAGING_FORK, "staging" }, \
{ XFS_DATA_FORK, "data" }, \
{ XFS_ATTR_FORK, "attr" }, \
{ XFS_COW_FORK, "cow" }
@@ -114,24 +116,6 @@ typedef enum {
{ XFS_LOOKUP_LEi, "le" }, \
{ XFS_LOOKUP_GEi, "ge" }
-/*
- * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
- * please keep the TRACE_DEFINE_ENUMs for it up to date.
- */
-typedef enum {
- XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
- XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
-} xfs_btnum_t;
-
-#define XFS_BTNUM_STRINGS \
- { XFS_BTNUM_BNOi, "bnobt" }, \
- { XFS_BTNUM_CNTi, "cntbt" }, \
- { XFS_BTNUM_RMAPi, "rmapbt" }, \
- { XFS_BTNUM_BMAPi, "bmbt" }, \
- { XFS_BTNUM_INOi, "inobt" }, \
- { XFS_BTNUM_FINOi, "finobt" }, \
- { XFS_BTNUM_REFCi, "refcbt" }
-
struct xfs_name {
const unsigned char *name;
int len;
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
deleted file mode 100644
index 79155eec341b..000000000000
--- a/fs/xfs/mrlock.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/rwsem.h>
-
-typedef struct {
- struct rw_semaphore mr_lock;
-#if defined(DEBUG) || defined(XFS_WARN)
- int mr_writer;
-#endif
-} mrlock_t;
-
-#if defined(DEBUG) || defined(XFS_WARN)
-#define mrinit(mrp, name) \
- do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
-#else
-#define mrinit(mrp, name) \
- do { init_rwsem(&(mrp)->mr_lock); } while (0)
-#endif
-
-#define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
-#define mrfree(mrp) do { } while (0)
-
-static inline void mraccess_nested(mrlock_t *mrp, int subclass)
-{
- down_read_nested(&mrp->mr_lock, subclass);
-}
-
-static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
-{
- down_write_nested(&mrp->mr_lock, subclass);
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 1;
-#endif
-}
-
-static inline int mrtryaccess(mrlock_t *mrp)
-{
- return down_read_trylock(&mrp->mr_lock);
-}
-
-static inline int mrtryupdate(mrlock_t *mrp)
-{
- if (!down_write_trylock(&mrp->mr_lock))
- return 0;
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 1;
-#endif
- return 1;
-}
-
-static inline void mrunlock_excl(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 0;
-#endif
- up_write(&mrp->mr_lock);
-}
-
-static inline void mrunlock_shared(mrlock_t *mrp)
-{
- up_read(&mrp->mr_lock);
-}
-
-static inline void mrdemote(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 0;
-#endif
- downgrade_write(&mrp->mr_lock);
-}
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
index ed08f76ff4f3..e488e1f4f63d 100644
--- a/fs/xfs/scrub/agb_bitmap.h
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -65,4 +65,9 @@ int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
+static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b)
+{
+ return xbitmap32_count_set_regions(&b->agbitmap);
+}
+
#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 6c6e5eba42c8..e954f07679dd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -556,28 +556,28 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ agbno = be32_to_cpu(agf->agf_bno_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ agbno = be32_to_cpu(agf->agf_cnt_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ level = be32_to_cpu(agf->agf_bno_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ level = be32_to_cpu(agf->agf_cnt_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ agbno = be32_to_cpu(agf->agf_rmap_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ level = be32_to_cpu(agf->agf_rmap_level);
if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 26bd1ff68f1b..427054b65b23 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -174,8 +174,7 @@ xrep_agf_find_btrees(
* We relied on the rmapbt to reconstruct the AGF. If we get a
* different root then something's seriously wrong.
*/
- if (fab[XREP_AGF_RMAPBT].root !=
- be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+ if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
return -EFSCORRUPTED;
/* We must find the refcountbt root if that feature is enabled. */
@@ -224,20 +223,14 @@ xrep_agf_set_roots(
struct xfs_agf *agf,
struct xrep_find_ag_btree *fab)
{
- agf->agf_roots[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].root);
- agf->agf_levels[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+ agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+ agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
- agf->agf_roots[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].root);
- agf->agf_levels[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+ agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+ agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
- agf->agf_levels[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+ agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+ agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
if (xfs_has_reflink(sc->mp)) {
agf->agf_refcount_root =
@@ -262,8 +255,7 @@ xrep_agf_calc_from_btrees(
int error;
/* Update the AGF counters from the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
if (error)
goto err;
@@ -276,8 +268,7 @@ xrep_agf_calc_from_btrees(
agf->agf_longest = cpu_to_be32(raa.longest);
/* Update the AGF counters from the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -333,12 +324,9 @@ xrep_agf_commit_new(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
@@ -559,16 +547,14 @@ xrep_agfl_collect_blocks(
goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
@@ -908,7 +894,7 @@ xrep_agi_calc_from_btrees(
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
@@ -928,8 +914,7 @@ xrep_agi_calc_from_btrees(
if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
xfs_agblock_t blocks;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp,
- XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 45edda096869..d421b253923e 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -687,8 +687,8 @@ xrep_abt_reset_counters(
* height values before re-initializing the perag info from the updated
* AGF to capture all the new values.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+ pag->pagf_repair_bno_level = pag->pagf_bno_level;
+ pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
/* Reinitialize with the values we just logged. */
return xrep_reinit_pagf(sc);
@@ -735,10 +735,11 @@ xrep_abt_build_new_trees(
ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
/* Allocate cursors for the staged btrees. */
- bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
- pag, XFS_BTNUM_BNO);
- cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
- pag, XFS_BTNUM_CNT);
+ bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
+
+ cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
/* Last chance to abort before we start committing fixes. */
if (xchk_should_terminate(sc, &error))
@@ -765,10 +766,8 @@ xrep_abt_build_new_trees(
* height so that we don't trip the verifiers when writing the new
* btree blocks to disk.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
- ra->new_bnobt.bload.btree_height;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
- ra->new_cntbt.bload.btree_height;
+ pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
+ pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
/* Load the free space by length tree. */
ra->array_cur = XFARRAY_CURSOR_INIT;
@@ -807,8 +806,8 @@ xrep_abt_build_new_trees(
return xrep_roll_ag_trans(sc);
err_levels:
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
err_cur:
xfs_btree_del_cursor(cnt_cur, error);
xfs_btree_del_cursor(bno_cur, error);
@@ -838,8 +837,8 @@ xrep_abt_remove_old_trees(
* Now that we've zapped all the old allocbt blocks we can turn off
* the alternate height mechanism.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
return 0;
}
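
For illustration, the staging flow that replaces xfs_allocbt_stage_cursor() is a two-step sequence: create an ordinary cursor with no transaction or AGF buffer, then attach the fake root. A minimal sketch under those assumptions (mp, pag, bload, and nr_records stand in for the fields used in the hunks nearby):

	struct xbtree_afakeroot	afake = { };
	struct xfs_btree_cur	*cur;

	/* No transaction or AGF buffer; the new btree does not exist yet. */
	cur = xfs_bnobt_init_cursor(mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(cur, &afake);

	/* Compute geometry and bulk-load records as usual. */
	error = xfs_btree_bload_compute_geometry(cur, &bload, nr_records);

	xfs_btree_del_cursor(cur, error);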
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 1449bb5262d9..0cb8d43912a8 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -566,3 +566,17 @@ xbitmap32_test(
*len = bn->bn_start - start;
return false;
}
+
+/* Count the number of set regions in this bitmap. */
+uint32_t
+xbitmap32_count_set_regions(
+ struct xbitmap32 *bitmap)
+{
+ struct xbitmap32_node *bn;
+ uint32_t nr = 0;
+
+ for_each_xbitmap32_extent(bn, bitmap)
+ nr++;
+
+ return nr;
+}
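
A caller counting discontiguous extents might use the new helper as follows; this is a minimal sketch, assuming the existing xbitmap32_init/_set/_destroy helpers:

	struct xbitmap32	map;
	uint32_t		nr;

	xbitmap32_init(&map);
	error = xbitmap32_set(&map, 3, 2);		/* covers [3, 5) */
	if (!error)
		error = xbitmap32_set(&map, 10, 1);	/* covers [10, 11) */

	/* Two disjoint set ranges, so this returns 2. */
	nr = xbitmap32_count_set_regions(&map);
	xbitmap32_destroy(&map);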
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 2df8911606d6..710c1ac5e323 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -62,4 +62,6 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
bool xbitmap32_empty(struct xbitmap32 *bitmap);
bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
+uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap);
+
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index b169cddde6da..24a15bf784f1 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -924,7 +924,7 @@ xchk_bmap(
if (!ifp)
return -ENOENT;
- info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.is_rt = xfs_ifork_is_realtime(ip, whichfork);
info.whichfork = whichfork;
info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
info.sc = sc;
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index a4bb89fdd510..1e656fab5e41 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -639,7 +639,13 @@ xrep_bmap_build_new_fork(
rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
- bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+ /*
+ * Allocate a new bmap btree cursor for reloading an inode block mapping
+ * data structure.
+ */
+ bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK);
+ xfs_btree_stage_ifakeroot(bmap_cur, ifake);
/*
* Figure out the size and format of the new fork, then fill it with
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 1935b9ce1885..fe678a0438bc 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -47,7 +47,7 @@ __xchk_btree_process_error(
*error = 0;
fallthrough;
default:
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_op_error(sc, cur, level,
*error, ret_ip);
else
@@ -91,7 +91,7 @@ __xchk_btree_set_corrupt(
{
sc->sm->sm_flags |= errflag;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_error(sc, cur, level,
ret_ip);
else
@@ -168,7 +168,7 @@ xchk_btree_rec(
if (xfs_btree_keycmp_lt(cur, &key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is high_key(rec) no larger than the parent high key? */
@@ -215,7 +215,7 @@ xchk_btree_key(
if (xfs_btree_keycmp_lt(cur, key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, level);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is this block's high key no larger than the parent high key? */
@@ -236,22 +236,18 @@ xchk_btree_ptr_ok(
int level,
union xfs_btree_ptr *ptr)
{
- bool res;
-
/* A btree rooted in an inode has no block pointer to the root. */
- if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == bs->cur->bc_nlevels)
return true;
/* Otherwise, check the pointers. */
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
- else
- res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
- if (!res)
+ if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return false;
+ }
- return res;
+ return true;
}
/* Check that a btree block's sibling matches what we expect. */
@@ -374,18 +370,21 @@ xchk_btree_check_block_owner(
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
- xfs_btnum_t btnum;
bool init_sa;
int error = 0;
if (!bs->cur)
return 0;
- btnum = bs->cur->bc_btnum;
agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
- init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+ /*
+ * If the btree being examined is not itself a per-AG btree, initialize
+ * sc->sa so that we can check for the presence of an ownership record
+ * in the rmap btree for the AG containing the block.
+ */
+ init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
if (init_sa) {
error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
@@ -399,11 +398,11 @@ xchk_btree_check_block_owner(
* have to nullify it (to shut down further block owner checks) if
* self-xref encounters problems.
*/
- if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+ if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops))
bs->cur = NULL;
xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
- if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+ if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops))
bs->cur = NULL;
out_free:
@@ -429,7 +428,7 @@ xchk_btree_check_owner(
* up.
*/
if (bp == NULL) {
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -442,7 +441,7 @@ xchk_btree_check_owner(
* duplicate cursors. Therefore, save the buffer daddr for
* later scanning.
*/
- if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+ if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
struct check_owner *co;
co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
@@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs(
* existing filesystems, so instead we disable the check for data fork
* bmap btrees when there's an attr fork.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+ if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
xfs_inode_has_attr_fork(bs->sc->ip))
return false;
@@ -508,7 +507,7 @@ xchk_btree_check_minrecs(
* child block might be less than the standard minrecs, but that's ok
* provided that there's only one direct child of the root.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == cur->bc_nlevels - 2) {
struct xfs_btree_block *root_block;
struct xfs_buf *root_bp;
@@ -562,7 +561,7 @@ xchk_btree_block_check_keys(
return;
}
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Make sure the high key of this block matches the parent. */
@@ -585,7 +584,6 @@ xchk_btree_get_block(
struct xfs_btree_block **pblock,
struct xfs_buf **pbp)
{
- xfs_failaddr_t failed_at;
int error;
*pblock = NULL;
@@ -597,13 +595,7 @@ xchk_btree_get_block(
return error;
xfs_btree_get_block(bs->cur, level, pbp);
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
- level, *pbp);
- else
- failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
- level, *pbp);
- if (failed_at) {
+ if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -664,7 +656,7 @@ xchk_btree_block_keys(
if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Get high keys */
@@ -728,7 +720,7 @@ xchk_btree(
* error codes for us.
*/
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 81f2b96bb5a7..abff79a77c72 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -29,6 +29,8 @@
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -82,6 +84,15 @@ __xchk_process_error(
sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -89,8 +100,7 @@ __xchk_process_error(
*error = 0;
fallthrough;
default:
- trace_xchk_op_error(sc, agno, bno, *error,
- ret_ip);
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
break;
}
return false;
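
The new -ECANCELED case pairs with check code that records a scrub outcome flag and then aborts the walk early; __xchk_process_error absorbs the errno so the ioctl still returns success with the outcome flags set. A hedged sketch of the caller side (record_is_bad is hypothetical):

	if (record_is_bad) {
		xchk_block_set_corrupt(sc, bp);
		error = -ECANCELED;	/* stop scanning, keep the flags */
	}
	if (!xchk_process_error(sc, agno, bno, &error))
		return error;		/* error was reset to 0 above */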
@@ -136,6 +146,16 @@ __xchk_fblock_process_error(
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_file_op_error(sc, whichfork, offset, *error,
+ ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -227,6 +247,19 @@ xchk_block_set_corrupt(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
+#ifdef CONFIG_XFS_QUOTA
+/* Record a corrupt quota counter. */
+void
+xchk_qcheck_set_corrupt(
+ struct xfs_scrub *sc,
+ unsigned int dqtype,
+ xfs_dqid_t id)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
+}
+#endif
+
/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
@@ -427,7 +460,7 @@ xchk_perag_read_headers(
* Grab the AG headers for the attached perag structure and wait for pending
* intents to drain.
*/
-static int
+int
xchk_perag_drain_and_lock(
struct xfs_scrub *sc)
{
@@ -555,46 +588,50 @@ xchk_ag_btcur_init(
{
struct xfs_mount *mp = sc->mp;
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
+ if (sa->agf_bp) {
/* Set up a bnobt cursor for cross-referencing. */
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_BNO);
- }
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
+ XFS_SCRUB_TYPE_BNOBT);
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_CNT);
- }
-
- /* Set up a inobt cursor for cross-referencing. */
- if (sa->agi_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
- sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_INO);
- }
-
- /* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_has_finobt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
- sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_FINO);
- }
-
- /* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_rmapbt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
- sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
+ XFS_SCRUB_TYPE_CNTBT);
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (xfs_has_rmapbt(mp)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
+ XFS_SCRUB_TYPE_RMAPBT);
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (xfs_has_reflink(mp)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
+ XFS_SCRUB_TYPE_REFCNTBT);
+ }
}
- /* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_reflink(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
- sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
- sa->agf_bp, sa->pag);
+ if (sa->agi_bp) {
+ /* Set up an inobt cursor for cross-referencing. */
+ sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
+ XFS_SCRUB_TYPE_INOBT);
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (xfs_has_finobt(mp)) {
+ sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
+ XFS_SCRUB_TYPE_FINOBT);
+ }
}
}
@@ -653,6 +690,13 @@ xchk_trans_cancel(
sc->tp = NULL;
}
+int
+xchk_trans_alloc_empty(
+ struct xfs_scrub *sc)
+{
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
@@ -672,7 +716,7 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/* Set us up with a transaction and an empty context. */
@@ -1259,6 +1303,15 @@ xchk_fsgates_enable(
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
xfs_drain_wait_enable();
+ if (scrub_fsgates & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_enable();
+
sc->flags |= scrub_fsgates;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index da09580b454a..89f7bbec887e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -32,6 +32,7 @@ xchk_should_terminate(
}
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -54,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc,
void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork,
xfs_fileoff_t offset);
+#ifdef CONFIG_XFS_QUOTA
+void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype,
+ xfs_dqid_t id);
+#endif
void xchk_block_xref_set_corrupt(struct xfs_scrub *sc,
struct xfs_buf *bp);
@@ -105,6 +110,7 @@ xchk_setup_rtsummary(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
int xchk_setup_quota(struct xfs_scrub *sc);
+int xchk_setup_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_ino_dqattach(struct xfs_scrub *sc)
@@ -116,12 +122,19 @@ xchk_setup_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
+static inline int
+xchk_setup_quotacheck(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
#endif
int xchk_setup_fscounters(struct xfs_scrub *sc);
+int xchk_setup_nlinks(struct xfs_scrub *sc);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
+int xchk_perag_drain_and_lock(struct xfs_scrub *sc);
/*
* Grab all AG resources, treating the inability to grab the perag structure as
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 1e82c727af8e..4de3f0f40f48 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -609,6 +609,6 @@ xrep_bmap_cow(
out_bitmap:
xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
xoff_bitmap_destroy(&xc->bad_fileoffs);
- kmem_free(xc);
+ kfree(xc);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d86ab51af928..076a310b8eb0 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -93,11 +93,11 @@ xchk_dir_actor(
return -ECANCELED;
}
- if (!strncmp(".", name->name, name->len)) {
+ if (xfs_dir2_samename(name, &xfs_name_dot)) {
/* If this is "." then check that the inum matches the dir. */
if (ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- } else if (!strncmp("..", name->name, name->len)) {
+ } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
/*
* If this is ".." in the root inode, check that the inum
* matches this dir.
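
The strncmp() calls being removed compared at most name->len bytes of a NUL-terminated literal, which works for exact-length names but is easy to get wrong. A minimal sketch of the assumed xfs_dir2_samename() semantics (the real helper may also short-circuit on pointer equality):

	/* Hypothetical illustration: names match iff length and bytes match. */
	static inline bool
	example_samename(const struct xfs_name *n1, const struct xfs_name *n2)
	{
		return n1->len == n2->len &&
		       !memcmp(n1->name, n2->name, n1->len);
	}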
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 5799e9a94f1f..d310737c8823 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -22,6 +22,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/fscounters.h"
/*
* FS Summary Counters
@@ -48,17 +49,6 @@
* our tolerance for mismatch between expected and actual counter values.
*/
-struct xchk_fscounters {
- struct xfs_scrub *sc;
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- uint64_t frextents;
- unsigned long long icount_min;
- unsigned long long icount_max;
- bool frozen;
-};
-
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -235,14 +225,19 @@ xchk_setup_fscounters(
* Pause all writer activity in the filesystem while we're scrubbing to
* reduce the likelihood of background perturbations to the counters
* throwing off our calculations.
+ *
+ * If we're repairing, we need to prevent any other thread from
+ * changing the global fs summary counters while we're repairing them.
+ * This requires the fs to be frozen, which will disable background
+ * reclaim and purge all inactive inodes.
*/
- if (sc->flags & XCHK_TRY_HARDER) {
+ if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
error = xchk_fscounters_freeze(sc);
if (error)
return error;
}
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/*
@@ -254,7 +249,9 @@ xchk_setup_fscounters(
* set the INCOMPLETE flag even when a negative errno is returned. This care
* must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
* ECANCELED) that are absorbed into a scrub state flag update by
- * xchk_*_process_error.
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
*/
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
@@ -482,6 +479,10 @@ xchk_fscount_within_range(
if (curr_value == expected)
return true;
+ /* We require exact matches when repair is running. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return false;
+
min_value = min(old_value, curr_value);
max_value = max(old_value, curr_value);
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
new file mode 100644
index 000000000000..461a13d25f4b
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSCOUNTERS_H__
+#define __XFS_SCRUB_FSCOUNTERS_H__
+
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+ bool frozen;
+};
+
+#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
new file mode 100644
index 000000000000..94cdb852bee4
--- /dev/null
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/fscounters.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * We correct errors in the filesystem summary counters by setting them to the
+ * values computed during the obligatory scrub phase. However, we must be
+ * careful not to allow any other thread to change the counters while we're
+ * computing and setting new values. To achieve this, we freeze the
+ * filesystem for the whole operation if the REPAIR flag is set. The checking
+ * function is stricter when we've frozen the fs.
+ */
+
+/*
+ * Reset the superblock counters. Caller is responsible for freezing the
+ * filesystem during the calculation and reset phases.
+ */
+int
+xrep_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_fscounters *fsc = sc->buf;
+
+ /*
+ * Reinitialize the in-core counters from what we computed. We froze
+ * the filesystem, so there shouldn't be anyone else trying to modify
+ * these counters.
+ */
+ if (!fsc->frozen) {
+ ASSERT(fsc->frozen);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xrep_reset_counters(mp, fsc);
+
+ percpu_counter_set(&mp->m_icount, fsc->icount);
+ percpu_counter_set(&mp->m_ifree, fsc->ifree);
+ percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ percpu_counter_set(&mp->m_frextents, fsc->frextents);
+ mp->m_sb.sb_frextents = fsc->frextents;
+
+ return 0;
+}
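
Taken with the setup change in fscounters.c above, the overall repair ordering looks roughly like this sketch (the freeze helper is the one referenced by xchk_setup_fscounters; thawing happens during scrub teardown and is not shown):

	/* Setup: freeze so no writer can perturb the counters. */
	error = xchk_fscounters_freeze(sc);
	if (error)
		return error;

	/* Check phase fills fsc->icount/ifree/fdblocks/frextents. */

	/* Repair phase installs the computed values. */
	error = xrep_fscounters(sc);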
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 531006910ca9..9020a6bef7f1 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -14,6 +14,7 @@
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
+#include "scrub/common.h"
/*
* Scrub and In-Core Filesystem Health Assessments
@@ -105,6 +106,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
[XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
[XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
+ [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
};
/* Return the health status mask for this scrub type. */
@@ -148,6 +151,24 @@ xchk_file_looks_zapped(
}
/*
+ * Scrub gave the filesystem a clean bill of health, so clear all the indirect
+ * markers of past problems (at least for the fs and ags) so that we can be
+ * healthy again.
+ */
+STATIC void
+xchk_mark_all_healthy(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
+ xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
+ for_each_perag(mp, agno, pag)
+ xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
+}
+
+/*
* Update filesystem health assessments based on what we found and did.
*
* If the scrubber finds errors, we mark sick whatever's mentioned in
@@ -164,6 +185,18 @@ xchk_update_health(
struct xfs_perag *pag;
bool bad;
+ /*
+ * The HEALTHY scrub type is a request from userspace to clear all the
+ * indirect flags after a clean scan of the entire filesystem. As such
+ * there's no sick flag defined for it, so we branch here ahead of the
+ * mask check.
+ */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY &&
+ !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xchk_mark_all_healthy(sc->mp);
+ return;
+ }
+
if (!sc->sick_mask)
return;
@@ -173,7 +206,7 @@ xchk_update_health(
case XHG_AG:
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_ag_mark_sick(pag, sc->sick_mask);
+ xfs_ag_mark_corrupt(pag, sc->sick_mask);
else
xfs_ag_mark_healthy(pag, sc->sick_mask);
xfs_perag_put(pag);
@@ -181,20 +214,30 @@ xchk_update_health(
case XHG_INO:
if (!sc->ip)
return;
- if (bad)
- xfs_inode_mark_sick(sc->ip, sc->sick_mask);
- else
+ if (bad) {
+ unsigned int mask = sc->sick_mask;
+
+ /*
+ * If we're coming in for repairs then we don't want
+ * sickness flags to propagate to the incore health
+ * status if the inode gets inactivated before we can
+ * fix it.
+ */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mask |= XFS_SICK_INO_FORGET;
+ xfs_inode_mark_corrupt(sc->ip, mask);
+ } else
xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
break;
case XHG_FS:
if (bad)
- xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+ xfs_fs_mark_corrupt(sc->mp, sc->sick_mask);
else
xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
break;
case XHG_RT:
if (bad)
- xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+ xfs_rt_mark_corrupt(sc->mp, sc->sick_mask);
else
xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
break;
@@ -205,13 +248,13 @@ xchk_update_health(
}
/* Is the given per-AG btree healthy enough for scanning? */
-bool
-xchk_ag_btree_healthy_enough(
+void
+xchk_ag_btree_del_cursor_if_sick(
struct xfs_scrub *sc,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_btree_cur **curp,
+ unsigned int sm_type)
{
- unsigned int mask = 0;
+ unsigned int mask = (*curp)->bc_ops->sick_mask;
/*
* We always want the cursor if it's the same type as whatever we're
@@ -220,41 +263,8 @@ xchk_ag_btree_healthy_enough(
* Otherwise, we're only interested in the btree for cross-referencing.
* If we know the btree is bad then don't bother, just set XFAIL.
*/
- switch (btnum) {
- case XFS_BTNUM_BNO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
- return true;
- mask = XFS_SICK_AG_BNOBT;
- break;
- case XFS_BTNUM_CNT:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
- return true;
- mask = XFS_SICK_AG_CNTBT;
- break;
- case XFS_BTNUM_INO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
- return true;
- mask = XFS_SICK_AG_INOBT;
- break;
- case XFS_BTNUM_FINO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
- return true;
- mask = XFS_SICK_AG_FINOBT;
- break;
- case XFS_BTNUM_RMAP:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
- return true;
- mask = XFS_SICK_AG_RMAPBT;
- break;
- case XFS_BTNUM_REFC:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
- return true;
- mask = XFS_SICK_AG_REFCNTBT;
- break;
- default:
- ASSERT(0);
- return true;
- }
+ if (sc->sm->sm_type == sm_type)
+ return;
/*
* If we just repaired some AG metadata, sc->sick_mask will reflect all
@@ -266,10 +276,42 @@ xchk_ag_btree_healthy_enough(
type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
mask &= ~sc->sick_mask;
- if (xfs_ag_has_sickness(pag, mask)) {
+ if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
- return false;
+ xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR);
+ *curp = NULL;
+ }
+}
+
+/*
+ * Quick scan to double-check that there isn't any evidence of lingering
+ * primary health problems. If we're still clear, then the health update will
+ * take care of clearing the indirect evidence.
+ */
+int
+xchk_health_record(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_FS_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_RT_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ for_each_perag(mp, agno, pag) {
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ if (sick & XFS_SICK_AG_PRIMARY)
+ xchk_set_corrupt(sc);
}
- return true;
+ return 0;
}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index a731b2467399..63fc426eb5ae 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -8,9 +8,10 @@
unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
void xchk_update_health(struct xfs_scrub *sc);
-bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc,
+ struct xfs_btree_cur **curp, unsigned int sm_type);
void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
+int xchk_health_record(struct xfs_scrub *sc);
#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index a720fc62262a..750d7b0cd25a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -76,7 +76,7 @@ xchk_inobt_xref_finobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+ ASSERT(xfs_btree_is_fino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -179,7 +179,7 @@ xchk_finobt_xref_inobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment(
* Otherwise, we expect that the finobt record is aligned to the
* cluster alignment as told by the superblock.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ if (xfs_btree_is_fino(bs->cur->bc_ops)) {
unsigned int imask;
imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
@@ -649,8 +649,7 @@ out:
*/
STATIC void
xchk_iallocbt_xref_rmap_btreeblks(
- struct xfs_scrub *sc,
- int which)
+ struct xfs_scrub *sc)
{
xfs_filblks_t blocks;
xfs_extlen_t inobt_blocks = 0;
@@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks(
STATIC void
xchk_iallocbt_xref_rmap_inodes(
struct xfs_scrub *sc,
- int which,
unsigned long long inodes)
{
xfs_filblks_t blocks;
@@ -719,17 +717,14 @@ xchk_iallocbt(
.next_startino = NULLAGINO,
.next_cluster_ino = NULLAGINO,
};
- xfs_btnum_t which;
int error;
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_INOBT:
cur = sc->sa.ino_cur;
- which = XFS_BTNUM_INO;
break;
case XFS_SCRUB_TYPE_FINOBT:
cur = sc->sa.fino_cur;
- which = XFS_BTNUM_FINO;
break;
default:
ASSERT(0);
@@ -741,7 +736,7 @@ xchk_iallocbt(
if (error)
return error;
- xchk_iallocbt_xref_rmap_btreeblks(sc, which);
+ xchk_iallocbt_xref_rmap_btreeblks(sc);
/*
* If we're scrubbing the inode btree, inode_blocks is the number of
@@ -750,9 +745,8 @@ xchk_iallocbt(
* knows about. We can't do this for the finobt since it only points
* to inode chunks with free inodes.
*/
- if (which == XFS_BTNUM_INO)
- xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
-
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes);
return error;
}
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
index b3f7182dd2f5..a00ec7ae1792 100644
--- a/fs/xfs/scrub/ialloc_repair.c
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -369,7 +369,7 @@ xrep_ibt_check_inode_ext(
* On a sparse inode fs, this cluster could be part of a sparse chunk.
* Sparse clusters must be aligned to sparse chunk alignment.
*/
- if (xfs_has_sparseinodes(mp) &&
+ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
(!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
!IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
return -EFSCORRUPTED;
@@ -663,8 +663,8 @@ xrep_ibt_build_new_trees(
ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
ri->new_inobt.bload.get_records = xrep_ibt_get_records;
- ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
- XFS_BTNUM_INO);
+ ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
xfarray_length(ri->inode_records));
if (error)
@@ -684,8 +684,8 @@ xrep_ibt_build_new_trees(
ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
ri->new_finobt.bload.get_records = xrep_fibt_get_records;
- fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
- &ri->new_finobt.afake, XFS_BTNUM_FINO);
+ fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
error = xfs_btree_bload_compute_geometry(fino_cur,
&ri->new_finobt.bload, ri->finobt_recs);
if (error)
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 0ca62d59f84a..eab380e95ef4 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -37,12 +37,15 @@
#include "xfs_attr_leaf.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
/*
* Inode Record Repair
@@ -126,6 +129,10 @@ struct xrep_inode {
/* Must we remove all access from this file? */
bool zap_acls;
+
+ /* Inode scanner to see if we can find the ftype from dirents */
+ struct xchk_iscan ftype_iscan;
+ uint8_t alleged_ftype;
};
/*
@@ -227,26 +234,233 @@ xrep_dinode_header(
dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
}
-/* Turn di_mode into /something/ recognizable. */
-STATIC void
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (ino != sc->sm->sm_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Uhoh, more than one parent for this inode and they don't agree on
+ * the file type?
+ */
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+ ri->alleged_ftype != name->type) {
+ trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+ ri->alleged_ftype);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember the ftype. */
+ trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+ ri->alleged_ftype = name->type;
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_walk_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = ri->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * Scan the directory to see if it contains an entry pointing to
+ * the inode that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Try to find the mode of the inode being repaired by looking for directories
+ * that point down to this file.
+ */
+STATIC int
+xrep_dinode_find_mode(
+ struct xrep_inode *ri,
+ uint16_t *mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /* No ftype means we have no other metadata to consult. */
+ if (!xfs_has_ftype(sc->mp)) {
+ *mode = S_IFREG;
+ return 0;
+ }
+
+ /*
+ * Scan all directories for parents that might point down to this
+ * inode. Skip the inode being repaired during the scan since it
+ * cannot be its own parent. Note that we still hold the AGI locked
+ * so there's a real possibility that _iscan_iter can return EBUSY.
+ */
+ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
+ ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
+ while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
+ if (S_ISDIR(VFS_I(dp)->i_mode))
+ error = xrep_dinode_findmode_walk_directory(ri, dp);
+ xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
+ xchk_irele(sc, dp);
+ if (error < 0)
+ break;
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&ri->ftype_iscan);
+ xchk_iscan_teardown(&ri->ftype_iscan);
+
+ if (error == -EBUSY) {
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
+ /*
+ * If we got an EBUSY after finding at least one
+ * dirent, that means the scan found an inode on the
+ * inactivation list and could not open it. Accept the
+ * alleged ftype and install a new mode below.
+ */
+ error = 0;
+ } else if (!(sc->flags & XCHK_TRY_HARDER)) {
+ /*
+ * Otherwise, retry the operation one time to see if
+ * the reason for the delay is an inode from the same
+ * cluster buffer waiting on the inactivation list.
+ */
+ error = -EDEADLOCK;
+ }
+ }
+ if (error)
+ return error;
+
+ /*
+ * Convert the discovered ftype into the file mode. If all else fails,
+ * return S_IFREG.
+ */
+ switch (ri->alleged_ftype) {
+ case XFS_DIR3_FT_DIR:
+ *mode = S_IFDIR;
+ break;
+ case XFS_DIR3_FT_WHT:
+ case XFS_DIR3_FT_CHRDEV:
+ *mode = S_IFCHR;
+ break;
+ case XFS_DIR3_FT_BLKDEV:
+ *mode = S_IFBLK;
+ break;
+ case XFS_DIR3_FT_FIFO:
+ *mode = S_IFIFO;
+ break;
+ case XFS_DIR3_FT_SOCK:
+ *mode = S_IFSOCK;
+ break;
+ case XFS_DIR3_FT_SYMLINK:
+ *mode = S_IFLNK;
+ break;
+ default:
+ *mode = S_IFREG;
+ break;
+ }
+ return 0;
+}
+
+/* Turn di_mode into /something/ recognizable. Returns 0 if we succeed. */
+STATIC int
xrep_dinode_mode(
struct xrep_inode *ri,
struct xfs_dinode *dip)
{
struct xfs_scrub *sc = ri->sc;
uint16_t mode = be16_to_cpu(dip->di_mode);
+ int error;
trace_xrep_dinode_mode(sc, dip);
if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
- return;
+ return 0;
+
+ /* Try to fix the mode. If we cannot, then leave everything alone. */
+ error = xrep_dinode_find_mode(ri, &mode);
+ switch (error) {
+ case -EINTR:
+ case -EBUSY:
+ case -EDEADLOCK:
+ /* temporary failure or fatal signal */
+ return error;
+ case 0:
+ /* found mode */
+ break;
+ default:
+ /* some other error, assume S_IFREG */
+ mode = S_IFREG;
+ break;
+ }
/* bad mode, so we set it to a file that only root can read */
- mode = S_IFREG;
dip->di_mode = cpu_to_be16(mode);
dip->di_uid = 0;
dip->di_gid = 0;
ri->zap_acls = true;
+ return 0;
}
/* Fix any conflicting flags that the verifiers complain about. */
@@ -1107,12 +1321,15 @@ xrep_dinode_core(
/* Fix everything the verifier will complain about. */
dip = xfs_buf_offset(bp, ri->imap.im_boffset);
xrep_dinode_header(sc, dip);
- xrep_dinode_mode(ri, dip);
+ iget_error = xrep_dinode_mode(ri, dip);
+ if (iget_error)
+ goto write;
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
xrep_dinode_zap_forks(ri, dip);
+write:
/* Write out the inode. */
trace_xrep_dinode_fixed(sc, dip);
xfs_dinode_calc_crc(sc->mp, dip);
@@ -1128,7 +1345,8 @@ xrep_dinode_core(
* accessing the inode. If iget fails, we still need to commit the
* changes.
*/
- iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ iget_error = xchk_iget(sc, ino, &sc->ip);
if (!iget_error)
xchk_ilock(sc, XFS_IOLOCK_EXCL);
@@ -1496,6 +1714,13 @@ xrep_inode(
ASSERT(ri != NULL);
error = xrep_dinode_problems(ri);
+ if (error == -EBUSY) {
+ /*
+ * Directory scan to recover inode mode encountered a
+ * busy inode, so we did not continue repairing things.
+ */
+ return 0;
+ }
if (error)
return error;
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..ec3478bc505e
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_bit.h"
+#include "xfs_icache.h"
+#include "scrub/scrub.h"
+#include "scrub/iscan.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Live File Scan
+ * ==============
+ *
+ * Live file scans walk every inode in a live filesystem. This is more or
+ * less like a regular iwalk, except that when we're advancing the scan cursor,
+ * we must ensure that inodes cannot be added or deleted anywhere between the
+ * old cursor value and the new cursor value. If we're advancing the cursor
+ * by one inode, the caller must hold that inode; if we're finding the next
+ * inode to scan, we must grab the AGI and hold it until we've updated the
+ * scan cursor.
+ *
+ * Callers are expected to use this code to scan all files in the filesystem to
+ * construct a new metadata index of some kind. The scan races against other
+ * live updates, which means there must be a provision to update the new index
+ * when updates are made to inodes that have already been scanned. The iscan lock
+ * can be used in live update hook code to stop the scan and protect this data
+ * structure.
+ *
+ * To keep the new index up to date with other metadata updates being made to
+ * the live filesystem, it is assumed that the caller will add hooks as needed
+ * to be notified when a metadata update occurs. The inode scanner must tell
+ * the hook code when an inode has been visited with xchk_iscan_mark_visited.
+ * Hook functions can use xchk_iscan_want_live_update to decide if the
+ * scanner's observations must be updated.
+ */
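
As a concrete illustration of the hook contract described above, a live-update hook might be structured like this; everything named example_* is hypothetical:

	STATIC int
	example_inode_hook(struct xchk_iscan *iscan, struct xfs_inode *ip)
	{
		/*
		 * Only apply the update if the scanner has already visited
		 * this inode; otherwise the scan itself will observe it.
		 */
		if (!xchk_iscan_want_live_update(iscan, ip->i_ino))
			return 0;
		return example_update_index(iscan, ip);	/* hypothetical */
	}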
+
+/*
+ * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so
+ * that the scan ignores that inode.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_inobt_rec_incore *rec,
+ xfs_agino_t lastrecino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+ xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+ if (pag->pag_agno != skip_agno)
+ return;
+ if (skip_agino < rec->ir_startino)
+ return;
+ if (skip_agino > lastrecino)
+ return;
+
+ rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */
+STATIC int
+xchk_iscan_find_next(
+ struct xchk_iscan *iscan,
+ struct xfs_buf *agi_bp,
+ struct xfs_perag *pag,
+ xfs_inofree_t *allocmaskp,
+ xfs_agino_t *cursor,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ xfs_agnumber_t agno = pag->pag_agno;
+ xfs_agino_t lastino = NULLAGINO;
+ xfs_agino_t first, last;
+ xfs_agino_t agino = *cursor;
+ int has_rec;
+ int error;
+
+ /* If the cursor is beyond the end of this AG, move to the next one. */
+ xfs_agino_range(mp, agno, &first, &last);
+ if (agino > last) {
+ *cursor = NULLAGINO;
+ return 0;
+ }
+
+ /*
+ * Look up the inode chunk for the current cursor position. If there
+ * is no chunk here, we want the next one.
+ */
+ cur = xfs_inobt_init_cursor(pag, tp, agi_bp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+ if (!error && !has_rec)
+ error = xfs_btree_increment(cur, 0, &has_rec);
+ for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+ xfs_inofree_t allocmask;
+
+ /*
+ * If we've run out of inobt records in this AG, move the
+ * cursor on to the next AG and exit. The caller can try
+ * again with the next AG.
+ */
+ if (!has_rec) {
+ *cursor = NULLAGINO;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+ if (error)
+ break;
+ if (!has_rec) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ /* Make sure that we always move forward. */
+ if (lastino != NULLAGINO &&
+ XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+ /*
+ * If this record only covers inodes that come before the
+ * cursor, advance to the next record.
+ */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ continue;
+
+ if (iscan->skip_ino)
+ xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+ /*
+ * If the incoming lookup put us in the middle of an inobt
+ * record, mark it and the previous inodes "free" so that the
+ * search for allocated inodes will start at the cursor.
+ * We don't care about ir_freecount here.
+ */
+ if (agino >= rec.ir_startino)
+ rec.ir_free |= xfs_inobt_maskn(0,
+ agino + 1 - rec.ir_startino);
+
+ /*
+ * If there are allocated inodes in this chunk, find them
+ * and update the scan cursor.
+ */
+ allocmask = ~rec.ir_free;
+ if (hweight64(allocmask) > 0) {
+ int next = xfs_lowbit64(allocmask);
+
+ ASSERT(next >= 0);
+ *cursor = rec.ir_startino + next;
+ *allocmaskp = allocmask >> next;
+ *nr_inodesp = XFS_INODES_PER_CHUNK - next;
+ break;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors.
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call. There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+ struct xchk_iscan *iscan,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t cursor, visited;
+
+ BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+ /*
+ * Special-case ino == 0 here so that we never set visited_ino to
+ * NULLFSINO when wrapping around EOFS, for that will let through all
+ * live updates.
+ */
+ cursor = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (cursor == 0)
+ visited = XFS_MAXINUMBER;
+ else
+ visited = cursor - 1;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = cursor;
+ iscan->__visited_ino = visited;
+ trace_xchk_iscan_move_cursor(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Prepare to return agno/agino to the iscan caller by moving the lastino
+ * cursor to the previous inode. Do this while we still hold the AGI so that
+ * no other threads can create or delete inodes in this AG.
+ */
+static inline void
+xchk_iscan_finish(
+ struct xchk_iscan *iscan)
+{
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = NULLFSINO;
+
+ /* All live updates will be applied from now on */
+ iscan->__visited_ino = NULLFSINO;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree. Advancing ino effectively means that we've pushed the inode
+ * scan forward, so set the iscan cursor to (ino - 1) so that our live update
+ * predicates will track inode allocations in that part of the inode number
+ * key space once we release the AGI buffer.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_advance(
+ struct xchk_iscan *iscan,
+ struct xfs_perag **pagp,
+ struct xfs_buf **agi_bpp,
+ xfs_inofree_t *allocmaskp,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int ret;
+
+ ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
+
+ do {
+ if (xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+
+ agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ECANCELED;
+
+ ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ if (ret)
+ goto out_pag;
+
+ agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
+ ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+ &agino, nr_inodesp);
+ if (ret)
+ goto out_buf;
+
+ if (agino != NULLAGINO) {
+ /*
+ * Found the next inode in this AG, so return it along
+ * with the AGI buffer and the perag structure to
+ * ensure it cannot go away.
+ */
+ xchk_iscan_move_cursor(iscan, agno, agino);
+ *agi_bpp = agi_bp;
+ *pagp = pag;
+ return 1;
+ }
+
+ /*
+ * Did not find any more inodes in this AG, move on to the next
+ * AG.
+ */
+ agno = (agno + 1) % mp->m_sb.sb_agcount;
+ xchk_iscan_move_cursor(iscan, agno, 0);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ trace_xchk_iscan_advance_ag(iscan);
+ } while (iscan->cursor_ino != iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(sc->tp, agi_bp);
+out_pag:
+ xfs_perag_put(pag);
+ return ret;
+}
+
+/*
+ * Grabbing the inode failed, so we need to back up the scan and ask the caller
+ * to try to _advance the scan again. Returns -EBUSY if we've run out of retry
+ * opportunities, -ECANCELED if the process has a fatal signal pending, or
+ * -EAGAIN if we should try again.
+ */
+STATIC int
+xchk_iscan_iget_retry(
+ struct xchk_iscan *iscan,
+ bool wait)
+{
+ ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);
+
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ if (wait) {
+ unsigned long relax;
+
+ /*
+ * Sleep for a period of time to let the rest of the system
+ * catch up. If we return early, someone sent a kill signal to
+ * the calling process.
+ */
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ trace_xchk_iscan_iget_retry_wait(iscan);
+
+ if (schedule_timeout_killable(relax) ||
+ xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+ }
+
+ iscan->cursor_ino--;
+ return -EAGAIN;
+}
+
+/*
+ * Grab an inode as part of an inode scan. While scanning this inode, the
+ * caller must ensure that no other threads can modify the inode until a call
+ * to xchk_iscan_visit succeeds.
+ *
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call again xchk_iscan_advance; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf *agi_bp,
+ xfs_inofree_t allocmask,
+ uint8_t nr_inodes)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t ino = iscan->cursor_ino;
+ unsigned int idx = 0;
+ unsigned int i;
+ int error;
+
+ ASSERT(iscan->__inodes[0] == NULL);
+
+ /* Fill the first slot in the inode array. */
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+
+ trace_xchk_iscan_iget(iscan, error);
+
+ if (error == -ENOENT || error == -EAGAIN) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * It's possible that this inode has lost all of its links but
+ * hasn't yet been inactivated. If we don't have a transaction
+ * or it's not writable, flush the inodegc workers and wait.
+ */
+ xfs_inodegc_flush(mp);
+ return xchk_iscan_iget_retry(iscan, true);
+ }
+
+ if (error == -EINVAL) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * We thought the inode was allocated, but the inode btree
+ * lookup failed, which means that it was freed since the last
+ * time we advanced the cursor. Back up and try again. This
+ * should never happen since we still hold the AGI buffer from the
+ * inobt check, but we need to be careful about infinite loops.
+ */
+ return xchk_iscan_iget_retry(iscan, false);
+ }
+
+ if (error) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return error;
+ }
+ idx++;
+ ino++;
+ allocmask >>= 1;
+
+ /*
+ * Now that we've filled the first slot in __inodes, try to fill the
+ * rest of the batch with consecutively ordered inodes to reduce the
+ * number of _iter calls. Make a bitmap of unallocated inodes from the
+ * zeroes in the inuse bitmap; these inodes will not be scanned, but
+ * the _want_live_update predicate will pass through all live updates.
+ *
+ * If we can't iget an allocated inode, stop and return what we have.
+ */
+ mutex_lock(&iscan->lock);
+ iscan->__batch_ino = ino - 1;
+ iscan->__skipped_inomask = 0;
+ mutex_unlock(&iscan->lock);
+
+ for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+ if (!(allocmask & 1)) {
+ ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ iscan->__skipped_inomask |= (1ULL << i);
+ mutex_unlock(&iscan->lock);
+ continue;
+ }
+
+ ASSERT(iscan->__inodes[idx] == NULL);
+
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+ if (error)
+ break;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ mutex_unlock(&iscan->lock);
+ idx++;
+ }
+
+ trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return idx;
+}
+
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t highest_skipped;
+
+ mutex_lock(&iscan->lock);
+
+ if (iscan->__batch_ino != NULLFSINO) {
+ highest_skipped = iscan->__batch_ino +
+ xfs_highbit64(iscan->__skipped_inomask);
+ iscan->__visited_ino = max(iscan->__visited_ino,
+ highest_skipped);
+
+ trace_xchk_iscan_skip(iscan);
+ }
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
+ */
+STATIC int
+xchk_iscan_iter_batch(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ int ret;
+
+ xchk_iscan_finish_batch(iscan);
+
+ if (iscan->iget_timeout)
+ iscan->__iget_deadline = jiffies +
+ msecs_to_jiffies(iscan->iget_timeout);
+
+ do {
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_inofree_t allocmask = 0;
+ uint8_t nr_inodes = 0;
+
+ ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+ &nr_inodes);
+ if (ret != 1)
+ return ret;
+
+ if (xchk_iscan_aborted(iscan)) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ ret = -ECANCELED;
+ break;
+ }
+
+ ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+ struct xchk_iscan *iscan,
+ struct xfs_inode **ipp)
+{
+ unsigned int i;
+ int error;
+
+ /* Find a cached inode, or go get another batch. */
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i])
+ goto foundit;
+ }
+
+ error = xchk_iscan_iter_batch(iscan);
+ if (error <= 0)
+ return error;
+
+ ASSERT(iscan->__inodes[0] != NULL);
+ i = 0;
+
+foundit:
+ /* Give the caller our reference. */
+ *ipp = iscan->__inodes[i];
+ iscan->__inodes[i] = NULL;
+ return 1;
+}
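
The canonical consumer loop matches the one used by xrep_dinode_find_mode earlier in this patch; example_examine is hypothetical per-inode work:

	while ((error = xchk_iscan_iter(&iscan, &ip)) == 1) {
		error = example_examine(sc, ip);	/* hypothetical */
		xchk_iscan_mark_visited(&iscan, ip);
		xchk_irele(sc, ip);
		if (error)
			break;
	}
	xchk_iscan_iter_finish(&iscan);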
+
+/* Clean up an xchk_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned int i;
+
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i]) {
+ xchk_irele(sc, iscan->__inodes[i]);
+ iscan->__inodes[i] = NULL;
+ }
+ }
+}
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+ struct xchk_iscan *iscan)
+{
+ xchk_iscan_iter_finish(iscan);
+ xchk_iscan_finish(iscan);
+ mutex_destroy(&iscan->lock);
+}
+
+/* Pick an AG from which to start a scan. */
+static inline xfs_ino_t
+xchk_iscan_rotor(
+ struct xfs_mount *mp)
+{
+ static atomic_t agi_rotor;
+ unsigned int r = atomic_inc_return(&agi_rotor) - 1;
+
+ /*
+ * Rotoring *backwards* through the AGs, so we add one here before
+ * subtracting from the agcount to arrive at an AG number.
+ */
+ r = (r % mp->m_sb.sb_agcount) + 1;
+
+ return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
+}
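+
+/*
+ * Worked example: with sb_agcount == 4, successive callers observe
+ * r == 1, 2, 3, 4, 1, ... and therefore start their scans at the first
+ * inode of AG 3, 2, 1, 0, 3, ... so that concurrent scans begin in
+ * different AGs.
+ */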
+
+/*
+ * Set ourselves up to start an inode scan. If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds. If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
+ */
+void
+xchk_iscan_start(
+ struct xfs_scrub *sc,
+ unsigned int iget_timeout,
+ unsigned int iget_retry_delay,
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t start_ino;
+
+ start_ino = xchk_iscan_rotor(sc->mp);
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ iscan->sc = sc;
+ clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+ iscan->iget_timeout = iget_timeout;
+ iscan->iget_retry_delay = iget_retry_delay;
+ iscan->__visited_ino = start_ino;
+ iscan->cursor_ino = start_ino;
+ iscan->scan_start_ino = start_ino;
+ mutex_init(&iscan->lock);
+ memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
+
+ trace_xchk_iscan_start(iscan, start_ino);
+}
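+
+/*
+ * For example, the nlinks scrubber below calls
+ * xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan), which retries
+ * each iget every 100ms for up to 30 seconds before giving up on an
+ * inode.
+ */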
+
+/*
+ * Mark this inode as having been visited. Callers must hold a sufficiently
+ * exclusive lock on the inode to prevent concurrent modifications.
+ */
+void
+xchk_iscan_mark_visited(
+ struct xchk_iscan *iscan,
+ struct xfs_inode *ip)
+{
+ mutex_lock(&iscan->lock);
+ iscan->__visited_ino = ip->i_ino;
+ trace_xchk_iscan_visit(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned. All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+ const struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ if (iscan->__batch_ino == NULLFSINO)
+ return false;
+ if (ino < iscan->__batch_ino)
+ return false;
+ if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+ return false;
+
+ return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
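+
+/*
+ * Worked example: if the batch was loaded with __batch_ino == 128 and bit
+ * 5 of __skipped_inomask set, inode 133 was unallocated at batch time, so
+ * any subsequent allocation and modification of inode 133 must be fed to
+ * the caller as live updates.
+ */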
+
+/*
+ * Do we need a live update for this inode? This is true if the scanner thread
+ * has visited this inode and the scan hasn't been aborted due to errors.
+ * Callers must hold a sufficiently exclusive lock on the inode to prevent
+ * scanners from reading any inode metadata.
+ */
+bool
+xchk_iscan_want_live_update(
+ struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ bool ret = false;
+
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ mutex_lock(&iscan->lock);
+
+ trace_xchk_iscan_want_live_update(iscan, ino);
+
+ /* Scan is finished, caller should receive all updates. */
+ if (iscan->__visited_ino == NULLFSINO) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * No inodes have been visited yet, so the visited cursor points at the
+ * start of the scan range. The caller should not receive any updates.
+ */
+ if (iscan->scan_start_ino == iscan->__visited_ino) {
+ ret = false;
+ goto unlock;
+ }
+
+ /*
+ * This inode was not allocated at the time of the iscan batch.
+ * The caller should receive all updates.
+ */
+ if (xchk_iscan_skipped(iscan, ino)) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor hasn't yet wrapped around the end of the FS. If
+ * @ino is inside the starred range, the caller should receive updates:
+ *
+ * 0 ------------ S ************ V ------------ EOFS
+ */
+ if (iscan->scan_start_ino <= iscan->__visited_ino) {
+ if (ino >= iscan->scan_start_ino &&
+ ino <= iscan->__visited_ino)
+ ret = true;
+
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor wrapped around the end of the FS. If @ino is
+ * inside the starred range, the caller should receive updates:
+ *
+ * 0 ************ V ------------ S ************ EOFS
+ */
+ if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino)
+ ret = true;
+
+unlock:
+ mutex_unlock(&iscan->lock);
+ return ret;
+}
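+
+/*
+ * Worked example of the wrapped case above: if the scan started at
+ * S == 3000 and the visited cursor has wrapped around to V == 500, then
+ * updates for inodes 3000..EOFS and 0..500 are wanted, while inodes
+ * 501..2999 have not been visited yet and their updates are not.
+ */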
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
new file mode 100644
index 000000000000..71f657552dfa
--- /dev/null
+++ b/fs/xfs/scrub/iscan.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ISCAN_H__
+#define __XFS_SCRUB_ISCAN_H__
+
+struct xchk_iscan {
+ struct xfs_scrub *sc;
+
+ /* Lock to protect the scan cursor. */
+ struct mutex lock;
+
+ /*
+ * This is the first inode in the inumber address space that we
+ * examined. When the scan wraps around back to here, the scan is
+ * finished.
+ */
+ xfs_ino_t scan_start_ino;
+
+ /* This is the inode that will be examined next. */
+ xfs_ino_t cursor_ino;
+
+	/* If set (neither zero nor NULLFSINO), skip this inode when scanning. */
+ xfs_ino_t skip_ino;
+
+ /*
+ * This is the last inode that we've successfully scanned, either
+ * because the caller scanned it, or we moved the cursor past an empty
+ * part of the inode address space. Scan callers should only use the
+	 * xchk_iscan_mark_visited function to modify this.
+ */
+ xfs_ino_t __visited_ino;
+
+ /* Operational state of the livescan. */
+ unsigned long __opstate;
+
+ /* Give up on iterating @cursor_ino if we can't iget it by this time. */
+ unsigned long __iget_deadline;
+
+ /* Amount of time (in ms) that we will try to iget an inode. */
+ unsigned int iget_timeout;
+
+ /* Wait this many ms to retry an iget. */
+ unsigned int iget_retry_delay;
+
+ /*
+ * The scan grabs batches of inodes and stashes them here before
+ * handing them out with _iter. Unallocated inodes are set in the
+ * mask so that all updates to that inode are selected for live
+ * update propagation.
+ */
+ xfs_ino_t __batch_ino;
+ xfs_inofree_t __skipped_inomask;
+ struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK];
+};
+
+/* Set if the scan has been aborted due to some event in the fs. */
+#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+
+static inline bool
+xchk_iscan_aborted(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_abort(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
+ unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_teardown(struct xchk_iscan *iscan);
+
+int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
+void xchk_iscan_iter_finish(struct xchk_iscan *iscan);
+
+void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip);
+bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino);
+
+#endif /* __XFS_SCRUB_ISCAN_H__ */
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index bb6d980b4fcd..4a0271123d94 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -239,7 +239,11 @@ xrep_newbt_alloc_ag_blocks(
xrep_newbt_validate_ag_alloc_hint(xnr);
- error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_near_bno(&args,
+ xnr->alloc_hint);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
@@ -309,7 +313,11 @@ xrep_newbt_alloc_file_blocks(
xrep_newbt_validate_file_alloc_hint(xnr);
- error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_start_ag(&args,
+ xnr->alloc_hint);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
@@ -535,7 +543,7 @@ xrep_newbt_claim_block(
trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
xnr->oinfo.oi_owner);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
agbno));
else
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 89f8e3970b1f..3d804d31af24 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -6,6 +6,8 @@
#ifndef __XFS_SCRUB_NEWBT_H__
#define __XFS_SCRUB_NEWBT_H__
+struct xfs_alloc_arg;
+
struct xrep_newbt_resv {
/* Link to list of extents that we've reserved. */
struct list_head list;
@@ -28,6 +30,11 @@ struct xrep_newbt_resv {
struct xrep_newbt {
struct xfs_scrub *sc;
+ /* Custom allocation function, or NULL for xfs_alloc_vextent */
+ int (*alloc_vextent)(struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint);
+
/* List of extents that we've reserved. */
struct list_head resv_list;
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
new file mode 100644
index 000000000000..8a7d9557897c
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+
+/*
+ * Live Inode Link Count Checking
+ * ==============================
+ *
+ * Inode link counts are "summary" metadata, in the sense that they are
+ * computed as the number of directory entries referencing each file on the
+ * filesystem. Therefore, we compute the correct link counts by creating a
+ * shadow link count structure and walking every inode.
+ */
+
+/* Set us up to scrub inode link counts. */
+int
+xchk_setup_nlinks(
+ struct xfs_scrub *sc)
+{
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting file link counts. For each file, we create a shadow link
+ * counting structure, then walk the entire directory tree, incrementing parent
+ * and child link counts for each directory entry seen.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the link counts for an inode that we've already scanned.
+ * This will cause our counts to be incorrect. Therefore, we hook all
+ * directory entry updates because that is when link count updates occur. By
+ * shadowing transaction updates in this manner, live nlink check can ensure by
+ * locking the inode and the shadow structure that its own copies are not out
+ * of date. Because the hook code runs in a different process context from the
+ * scrub code and the scrub state flags are not accessed atomically, failures
+ * in the hook code must abort the iscan and the scrubber must notice the
+ * aborted scan and set the incomplete flag.
+ *
+ * Note that we use jump labels and srcu notifier hooks to minimize the
+ * overhead when live nlinks is /not/ running. Locking order for nlink
+ * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
+ */
+
+/*
+ * Add a delta to an nlink counter, clamping the value to U32_MAX. Because
+ * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
+ * even if we lose some precision.
+ */
+static inline void
+careful_add(
+ xfs_nlink_t *nlinkp,
+ int delta)
+{
+ uint64_t new_value = (uint64_t)(*nlinkp) + delta;
+
+ BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
+ *nlinkp = min_t(uint64_t, new_value, U32_MAX);
+}
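+
+/*
+ * For instance, adding delta == 5 to a counter already at U32_MAX - 1
+ * stores U32_MAX instead of wrapping.  Negative deltas rely on the
+ * modular arithmetic of the uint64_t addition, which is fine so long as
+ * the counter is at least as large as |delta|.
+ */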
+
+/* Update incore link count information. Caller must hold the nlinks lock. */
+STATIC int
+xchk_nlinks_update_incore(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino,
+ int parents_delta,
+ int backrefs_delta,
+ int children_delta)
+{
+ struct xchk_nlink nl;
+ int error;
+
+ if (!xnc->nlinks)
+ return 0;
+
+ error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+ if (error)
+ return error;
+
+ trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
+ backrefs_delta, children_delta);
+
+ careful_add(&nl.parents, parents_delta);
+ careful_add(&nl.backrefs, backrefs_delta);
+ careful_add(&nl.children, children_delta);
+
+ nl.flags |= XCHK_NLINK_WRITTEN;
+ error = xfarray_store(xnc->nlinks, ino, &nl);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete.
+ */
+ error = -ECANCELED;
+ }
+ return error;
+}
+
+/*
+ * Apply a link count change from the regular filesystem into our shadow link
+ * count structure based on a directory update in progress.
+ */
+STATIC int
+xchk_nlinks_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xchk_nlink_ctrs *xnc;
+ int error;
+
+ xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+
+ trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
+ p->delta, p->name->name, p->name->len);
+
+ /*
+ * If we've already scanned @dp, update the number of parents that link
+ * to @ip. If @ip is a subdirectory, update the number of child links
+ * going out of @dp.
+ */
+ if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
+ 0, 0);
+ if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
+ error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+ 0, p->delta);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_abort;
+ }
+
+ /*
+ * If @ip is a subdirectory and we've already scanned it, update the
+ * number of backrefs pointing to @dp.
+ */
+ if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
+ xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+ p->delta, 0);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_abort;
+ }
+
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+ return NOTIFY_DONE;
+}
+
+/* Bump the observed link count for the inode referenced by this entry. */
+STATIC int
+xchk_nlinks_collect_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+ bool dot = false, dotdot = false;
+ int error;
+
+ /* Does this name make sense? */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ if (name->len == 1 && name->name[0] == '.')
+ dot = true;
+ else if (name->len == 2 && name->name[0] == '.' &&
+ name->name[1] == '.')
+ dotdot = true;
+
+ /* Don't accept a '.' entry that points somewhere else. */
+ if (dot && ino != dp->i_ino) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Don't accept an invalid inode number. */
+ if (!xfs_verify_dir_ino(sc->mp, ino)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Update the shadow link counts if we haven't already failed. */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
+
+ mutex_lock(&xnc->lock);
+
+ /*
+ * If this is a dotdot entry, it is a back link from dp to ino. How
+ * we handle this depends on whether or not dp is the root directory.
+ *
+ * The root directory is its own parent, so we pretend the dotdot entry
+ * establishes the "parent" of the root directory. Increment the
+ * number of parents of the root directory.
+ *
+ * Otherwise, increment the number of backrefs pointing back to ino.
+ */
+ if (dotdot) {
+ if (dp == sc->mp->m_rootip)
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ else
+ error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link from dp to ino, increment the
+ * number of parents linking into ino.
+ */
+ if (!dot && !dotdot) {
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link to a subdirectory, increment the
+ * number of child links of dp.
+ */
+ if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
+ error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
+ if (error)
+ goto out_unlock;
+ }
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xnc->lock);
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
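+
+/*
+ * Worked example (hypothetical names): a dirent "photos" of type
+ * XFS_DIR3_FT_DIR in directory A pointing to directory B increments B's
+ * parent count and A's child count here.  When the scan walks B itself,
+ * B's dotdot entry increments A's backref count; the root directory's own
+ * dotdot is instead counted as the root's parent.
+ */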
+
+/* Walk a directory to bump the observed link counts of the children. */
+STATIC int
+xchk_nlinks_collect_dir(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /* Prevent anyone from changing this directory while we walk it. */
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * The dotdot entry of an unlinked directory still points to the last
+ * parent, but the parent no longer links to this directory. Skip the
+ * directory to avoid overcounting.
+ */
+ if (VFS_I(dp)->i_nlink == 0)
+ goto out_unlock;
+
+ /*
+ * We cannot count file links if the directory looks as though it has
+ * been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_abort;
+ }
+
+ error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+
+ xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
+ goto out_unlock;
+
+out_abort:
+ xchk_set_incomplete(sc);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/* If this looks like a valid pointer, count it. */
+static inline int
+xchk_nlinks_collect_metafile(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ if (!xfs_verify_ino(xnc->sc->mp, ino))
+ return 0;
+
+ trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
+ return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+}
+
+/* Bump the link counts of metadata files rooted in the superblock. */
+STATIC int
+xchk_nlinks_collect_metafiles(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = xnc->sc->mp;
+ int error = -ECANCELED;
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ goto out_incomplete;
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
+ if (error)
+ goto out_abort;
+ mutex_unlock(&xnc->lock);
+
+ return 0;
+
+out_abort:
+ mutex_unlock(&xnc->lock);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(xnc->sc);
+ return error;
+}
+
+/* Advance the collection scan cursor for this non-directory file. */
+static inline int
+xchk_nlinks_collect_file(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+}
+
+/* Walk all directories and count inode links. */
+STATIC int
+xchk_nlinks_collect(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /* Count the rt and quota files that are rooted in the superblock. */
+ error = xchk_nlinks_collect_metafiles(xnc);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Cancel the transaction to release the log grant space while we scan
+ * the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
+ if (S_ISDIR(VFS_I(ip)->i_mode))
+ error = xchk_nlinks_collect_dir(xnc, ip);
+ else
+ error = xchk_nlinks_collect_file(xnc, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->collect_iscan);
+ if (error) {
+ xchk_set_incomplete(sc);
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Switch out for a real transaction in preparation for building a new
+ * tree.
+ */
+ xchk_trans_cancel(sc);
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing file link counters. Walk each inode and compare the link
+ * counts against our shadow information; and then walk each shadow link count
+ * structure (that wasn't covered in the first part), comparing it against the
+ * file.
+ */
+
+/* Read the observed link count for comparison with the actual inode. */
+STATIC int
+xchk_nlinks_comparison_read(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino,
+ struct xchk_nlink *obs)
+{
+ struct xchk_nlink nl;
+ int error;
+
+ error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+ if (error)
+ return error;
+
+ nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
+
+ error = xfarray_store(xnc->nlinks, ino, &nl);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete. This
+ * shouldn't really happen outside of the collection phase.
+ */
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+ if (error)
+ return error;
+
+ /* Copy the counters, but do not expose the internal state. */
+ obs->parents = nl.parents;
+ obs->backrefs = nl.backrefs;
+ obs->children = nl.children;
+ obs->flags = 0;
+ return 0;
+}
+
+/* Check our link count against an inode. */
+STATIC int
+xchk_nlinks_compare_inode(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ uint64_t total_links;
+ unsigned int actual_nlink;
+ int error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * If we don't have ftype to get an accurate count of the subdirectory
+ * entries in this directory, take advantage of the fact that on a
+ * consistent ftype=0 filesystem, the number of subdirectory
+ * backreferences (dotdot entries) pointing towards this directory
+ * should be equal to the number of subdirectory entries in the
+ * directory.
+ */
+ if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
+ obs.children = obs.backrefs;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
+
+ /*
+ * If we found so many parents that we'd overflow i_nlink, we must flag
+ * this as a corruption. The VFS won't let users increase the link
+ * count, but it will let them decrease it.
+ */
+ if (total_links > XFS_MAXLINK) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /* Link counts should match. */
+ if (total_links != actual_nlink) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
+ /*
+ * The collection phase ignores directories with zero link
+ * count, so we ignore them here too.
+ *
+ * The number of subdirectory backreferences (dotdot entries)
+ * pointing towards this directory should be equal to the
+ * number of subdirectory entries in the directory.
+ */
+ if (obs.children != obs.backrefs)
+ xchk_ino_xref_set_corrupt(sc, ip->i_ino);
+ } else {
+ /*
+ * Non-directories and unlinked directories should not have
+ * back references.
+ */
+ if (obs.backrefs != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /*
+ * Non-directories and unlinked directories should not have
+ * children.
+ */
+ if (obs.children != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+ if (ip == sc->mp->m_rootip) {
+ /*
+ * For the root of a directory tree, both the '.' and '..'
+ * entries should point to the root directory. The dotdot
+		 * entry is counted as the sole parent of the root rather than
+		 * as a backref, which is why we expect exactly one parent here.
+ */
+ if (obs.parents != 1) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ } else if (actual_nlink > 0) {
+ /*
+ * Linked files that are not the root directory should have at
+ * least one parent.
+ */
+ if (obs.parents == 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+out_corrupt:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ error = -ECANCELED;
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/*
+ * Check our link count against an inode that wasn't checked previously. This
+ * is intended to catch directories with dangling links, though we could be
+ * racing with inode allocation in other threads.
+ */
+STATIC int
+xchk_nlinks_compare_inum(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ struct xchk_nlink obs;
+ struct xfs_mount *mp = xnc->sc->mp;
+ struct xfs_trans *tp = xnc->sc->tp;
+ struct xfs_buf *agi_bp;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * The first iget failed, so try again with the variant that returns
+ * either an incore inode or the AGI buffer. If the function returns
+ * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
+ * can guarantee that the inode won't be allocated while we check for
+ * a zero link count in the observed link count data.
+ */
+ error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
+ if (!error) {
+ /* Actually got an inode, so use the inode compare. */
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_irele(xnc->sc, ip);
+ return error;
+ }
+ if (error == -ENOENT || error == -EINVAL) {
+ /* No inode was found. Check for zero link count below. */
+ error = 0;
+ }
+ if (error)
+ goto out_agi;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_agi;
+ }
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_comparison_read(xnc, ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ trace_xchk_nlinks_check_zero(mp, ino, &obs);
+
+ /*
+ * If we can't grab the inode, the link count had better be zero. We
+ * still hold the AGI to prevent inode allocation/freeing.
+ */
+ if (xchk_nlink_total(NULL, &obs) != 0) {
+ xchk_ino_set_corrupt(xnc->sc, ino);
+ error = -ECANCELED;
+ }
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_agi:
+ if (agi_bp)
+ xfs_trans_brelse(tp, agi_bp);
+ return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem to compare the link count. Move
+ * on if we can't grab an inode, since we'll revisit unchecked nlink records in
+ * the second part.
+ */
+static int
+xchk_nlinks_compare_iter(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ do {
+ error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+ } while (error == -EBUSY);
+
+ return error;
+}
+
+/* Compare the link counts we observed against the live information. */
+STATIC int
+xchk_nlinks_compare(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink nl;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Create a new empty transaction so that we can advance the iscan
+ * cursor without deadlocking if the inobt has a cycle and push on the
+ * inactivation workqueue.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare the link
+ * counts. Inodes skipped by _compare_iter will be tried again in the
+ * next phase of the scan.
+ */
+ xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
+ while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Walk all the non-null nlink observations that weren't checked in the
+ * previous step.
+ */
+ mutex_lock(&xnc->lock);
+ while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
+ xfs_ino_t ino = cur - 1;
+
+ if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xnc->lock);
+
+ error = xchk_nlinks_compare_inum(xnc, ino);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xnc->sc, &error))
+ return error;
+
+ mutex_lock(&xnc->lock);
+ }
+ mutex_unlock(&xnc->lock);
+
+ return error;
+}
+
+/* Tear down everything associated with a nlinks check. */
+static void
+xchk_nlinks_teardown_scan(
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xnc->collect_iscan);
+
+ xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
+
+ xfarray_destroy(xnc->nlinks);
+ xnc->nlinks = NULL;
+
+ xchk_iscan_teardown(&xnc->collect_iscan);
+ mutex_destroy(&xnc->lock);
+ xnc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate link count data. If
+ * the scan is successful, the counts will be left alive for a repair. If any
+ * error occurs, we'll tear everything down.
+ */
+STATIC int
+xchk_nlinks_setup_scan(
+ struct xfs_scrub *sc,
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned long long max_inos;
+ xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1;
+ xfs_agino_t first_agino, last_agino;
+ int error;
+
+ ASSERT(xnc->sc == NULL);
+ xnc->sc = sc;
+
+ mutex_init(&xnc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
+
+ /*
+ * Set up enough space to store an nlink record for the highest
+ * possible inode number in this system.
+ */
+ xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
+ max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
+ descr = xchk_xfile_descr(sc, "file link counts");
+ error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
+ sizeof(struct xchk_nlink), &xnc->nlinks);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * file link counts. The hook only triggers for inodes that were
+ * already scanned, and the scanner thread takes each inode's ILOCK,
+ * which means that any in-progress inode updates will finish before we
+ * can scan the inode.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
+ error = xfs_dir_hook_add(mp, &xnc->dhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the inode link count data to repair. */
+ sc->buf_cleanup = xchk_nlinks_teardown_scan;
+ return 0;
+
+out_teardown:
+ xchk_nlinks_teardown_scan(xnc);
+ return error;
+}
+
+/* Scrub the link count of all inodes on the filesystem. */
+int
+xchk_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc = sc->buf;
+ int error = 0;
+
+ /* Set ourselves up to check link counts on the live filesystem. */
+ error = xchk_nlinks_setup_scan(sc, xnc);
+ if (error)
+ return error;
+
+ /* Walk all inodes, picking up link count information. */
+ error = xchk_nlinks_collect(xnc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Fail fast if we're not playing with a full dataset. */
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ xchk_set_incomplete(sc);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare link counts. */
+ error = xchk_nlinks_compare(xnc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Check one last time for an incomplete dataset. */
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ xchk_set_incomplete(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
new file mode 100644
index 000000000000..a950f3daf204
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NLINKS_H__
+#define __XFS_SCRUB_NLINKS_H__
+
+/* Live link count control structure. */
+struct xchk_nlink_ctrs {
+ struct xfs_scrub *sc;
+
+ /* Shadow link count data and its mutex. */
+ struct xfarray *nlinks;
+ struct mutex lock;
+
+ /*
+ * The collection step uses a separate iscan context from the compare
+ * step because the collection iscan coordinates live updates to the
+ * observation data while this scanner is running. The compare iscan
+ * is secondary and can be reinitialized as needed.
+ */
+ struct xchk_iscan collect_iscan;
+ struct xchk_iscan compare_iscan;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+};
+
+/*
+ * In-core link counts for a given inode in the filesystem.
+ *
+ * For an example rootdir, the directory entries and the fields to which they
+ * are accounted are as follows:
+ *
+ * Root directory:
+ *
+ * . points to self (root.child)
+ * .. points to self (root.parent)
+ * f1 points to a child file (f1.parent)
+ * d1 points to a child dir (d1.parent, root.child)
+ *
+ * Subdirectory d1:
+ *
+ * . points to self (d1.child)
+ * .. points to root dir (root.backref)
+ * f2 points to child file (f2.parent)
+ * f3 points to root.f1 (f1.parent)
+ *
+ * root.nlink == 3 (root.dot, root.dotdot, root.d1)
+ * d1.nlink == 2 (root.d1, d1.dot)
+ * f1.nlink == 2 (root.f1, d1.f3)
+ * f2.nlink == 1 (d1.f2)
+ */
+struct xchk_nlink {
+ /* Count of forward links from parent directories to this file. */
+ xfs_nlink_t parents;
+
+ /*
+ * Count of back links to this parent directory from child
+ * subdirectories.
+ */
+ xfs_nlink_t backrefs;
+
+ /*
+	 * Count of forward links from this directory to child subdirectories.
+	 * The dot entries shown in the example above are added separately by
+	 * xchk_nlink_total.  Should be zero for non-directories.
+ */
+ xfs_nlink_t children;
+
+ /* Record state flags */
+ unsigned int flags;
+};
+
+/*
+ * This incore link count has been written at least once. We never want to
+ * store an xchk_nlink that looks uninitialized.
+ */
+#define XCHK_NLINK_WRITTEN (1U << 0)
+
+/* Already checked this link count record. */
+#define XCHK_NLINK_COMPARE_SCANNED (1U << 1)
+
+/* Already made a repair with this link count record. */
+#define XREP_NLINK_DIRTY (1U << 2)
+
+/* Compute total link count, using large enough variables to detect overflow. */
+static inline uint64_t
+xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live)
+{
+ uint64_t ret = live->parents;
+
+ /* Add one link count for the dot entry of any linked directory. */
+ if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink)
+ ret++;
+ return ret + live->children;
+}
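+
+/*
+ * Worked example using subdirectory d1 from the comment above: the shadow
+ * record stores parents == 1 (root's d1 entry) and children == 0 (f2 and
+ * f3 are regular files); the extra link for the dot entry of a linked
+ * directory brings the total to 2, matching d1.nlink.
+ */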
+
+#endif /* __XFS_SCRUB_NLINKS_H__ */
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
new file mode 100644
index 000000000000..b87618322f55
--- /dev/null
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Inode Link Count Repair
+ * ============================
+ *
+ * Use the live inode link count information that we collected to replace the
+ * nlink values of the incore inodes. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * inode is locked.
+ */
+
+/*
+ * Correct the link count of the given inode. Because we have to grab locks
+ * and resources in a certain order, it's possible that this will be a no-op.
+ */
+STATIC int
+xrep_nlinks_repair_inode(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ uint64_t total_links;
+ uint64_t actual_nlink;
+ bool dirty = false;
+ int error;
+
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(sc->tp, ip, 0);
+
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * We're done accessing the shared scan data, so we can drop the lock.
+ * We still hold @ip's ILOCK, so its link count cannot change.
+ */
+ mutex_unlock(&xnc->lock);
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ /*
+ * Non-directories cannot have directories pointing up to them.
+ *
+ * We previously set error to zero, but set it again because one static
+ * checker author fears that programmers will fail to maintain this
+ * invariant and built their tool to flag this as a security risk. A
+ * different tool author made their bot complain about the redundant
+ * store. This is a never-ending and stupid battle; both tools missed
+ * *actual bugs* elsewhere; and I no longer care.
+ */
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) {
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ error = 0;
+ goto out_trans;
+ }
+
+ /*
+ * We did not find any links to this inode. If the inode agrees, we
+ * have nothing further to do. If not, the inode has a nonzero link
+ * count and we don't have anywhere to graft the child onto. Dropping
+ * a live inode's link count to zero can cause unexpected shutdowns in
+ * inactivation, so leave it alone.
+ */
+ if (total_links == 0) {
+ if (actual_nlink != 0)
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ goto out_trans;
+ }
+
+ /* Commit the new link count if it changed. */
+ if (total_links != actual_nlink) {
+ if (total_links > XFS_MAXLINK) {
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ goto out_trans;
+ }
+
+ trace_xrep_nlinks_update_inode(mp, ip, &obs);
+
+ set_nlink(VFS_I(ip), total_links);
+ dirty = true;
+ }
+
+ if (!dirty) {
+ error = 0;
+ goto out_trans;
+ }
+
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+
+ error = xrep_trans_commit(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ return error;
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_trans:
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem for repairs. Move on if we can't
+ * grab an inode, since we're still making forward progress.
+ */
+static int
+xrep_nlinks_iter(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ do {
+ error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+ } while (error == -EBUSY);
+
+ return error;
+}
+
+/* Commit the new inode link counters. */
+int
+xrep_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc = sc->buf;
+ int error;
+
+ /*
+ * We need ftype for an accurate count of the number of child
+ * subdirectory links. Child subdirectories with a back link (dotdot
+ * entry) but no forward link are unfixable, so we cannot repair the
+ * link count of the parent directory based on the back link count
+ * alone. Filesystems without ftype support are rare (old V4) so we
+ * just skip out here.
+ */
+ if (!xfs_has_ftype(sc->mp))
+ return -EOPNOTSUPP;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare and fix the
+ * link counts. Retry iget every tenth of a second for up to 30
+ * seconds -- even if repair misses a few inodes, we still try to fix
+ * as many of them as we can.
+ */
+ xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan);
+ ASSERT(sc->ip == NULL);
+
+ while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) {
+ /*
+ * Commit the scrub transaction so that we can create repair
+ * transactions with the correct reservations.
+ */
+ xchk_trans_cancel(sc);
+
+ error = xrep_nlinks_repair_inode(xnc);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip);
+ xchk_irele(sc, sc->ip);
+ sc->ip = NULL;
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+
+ /*
+ * Create a new empty transaction so that we can advance the
+ * iscan cursor without deadlocking if the inobt has a cycle.
+ * We can only push the inactivation workqueues with an empty
+ * transaction.
+ */
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
new file mode 100644
index 000000000000..c77eb2de8df7
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck
+ * ===============
+ *
+ * Quota counters are "summary" metadata, in the sense that they are computed
+ * as the summation of the block usage counts for every file on the filesystem.
+ * Therefore, we compute the correct icount, bcount, and rtbcount values by
+ * creating a shadow quota counter structure and walking every inode.
+ */
+
+/* Track the quota deltas for a dquot in a transaction. */
+struct xqcheck_dqtrx {
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+
+ int64_t icount_delta;
+
+ int64_t bcount_delta;
+ int64_t delbcnt_delta;
+
+ int64_t rtbcount_delta;
+ int64_t delrtb_delta;
+};
+
+#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS)
+
+/*
+ * Track the quota deltas for all dquots attached to a transaction if the
+ * quota deltas are being applied to an inode that we already scanned.
+ */
+struct xqcheck_dqacct {
+ struct rhash_head hash;
+ uintptr_t tx_id;
+ struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS];
+ unsigned int refcount;
+};
+
+/* Free a shadow dquot accounting structure. */
+static void
+xqcheck_dqacct_free(
+ void *ptr,
+ void *arg)
+{
+ struct xqcheck_dqacct *dqa = ptr;
+
+ kfree(dqa);
+}
+
+/* Set us up to scrub quota counters. */
+int
+xchk_setup_quotacheck(
+ struct xfs_scrub *sc)
+{
+ if (!XFS_IS_QUOTA_ON(sc->mp))
+ return -ENOENT;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA);
+
+ sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached
+ * to each inode, we create a shadow dquot, and compute the inode count and add
+ * the data/rt block usage from what we see.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the quota counters for an inode that we've already
+ * scanned. This will cause our counts to be incorrect. Therefore, we hook
+ * the live transaction code in two places: (1) when the callers update the
+ * per-transaction dqtrx structure to log quota counter updates; and (2) when
+ * transaction commit actually logs those updates to the incore dquot. By
+ * shadowing transaction updates in this manner, live quotacheck can ensure
+ * by locking the dquot and the shadow structure that its own copies are not
+ * out of date. Because the hook code runs in a different process context from
+ * the scrub code and the scrub state flags are not accessed atomically,
+ * failures in the hook code must abort the iscan and the scrubber must notice
+ * the aborted scan and set the incomplete flag.
+ *
+ * Note that we use srcu notifier hooks to minimize the overhead when live
+ * quotacheck is /not/ running.
+ */
+
+/* Update incore dquot counter information from a live update. */
+static int
+xqcheck_update_incore_counts(
+ struct xqcheck *xqc,
+ struct xfarray *counts,
+ xfs_dqid_t id,
+ int64_t inodes,
+ int64_t nblks,
+ int64_t rtblks)
+{
+ struct xqcheck_dquot xcdq;
+ int error;
+
+ error = xfarray_load_sparse(counts, id, &xcdq);
+ if (error)
+ return error;
+
+ xcdq.flags |= XQCHECK_DQUOT_WRITTEN;
+ xcdq.icount += inodes;
+ xcdq.bcount += nblks;
+ xcdq.rtbcount += rtblks;
+
+ error = xfarray_store(counts, id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete.
+ */
+ error = -ECANCELED;
+ }
+ return error;
+}
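+
+/*
+ * For instance, collecting a regular file that owns 100 data blocks and
+ * no realtime blocks passes (inodes, nblks, rtblks) == (1, 100, 0) here
+ * once per enabled quota type, bumping icount, bcount, and rtbcount in
+ * the matching shadow dquot.
+ */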
+
+/* Decide if this is the shadow dquot accounting structure for a transaction. */
+static int
+xqcheck_dqacct_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const uintptr_t *tx_idp = arg->key;
+ const struct xqcheck_dqacct *dqa = obj;
+
+ if (dqa->tx_id != *tx_idp)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xqcheck_dqacct_hash_params = {
+ .min_size = 32,
+ .key_len = sizeof(uintptr_t),
+ .key_offset = offsetof(struct xqcheck_dqacct, tx_id),
+ .head_offset = offsetof(struct xqcheck_dqacct, hash),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xqcheck_dqacct_obj_cmpfn,
+};
+
+/* Find a shadow dqtrx slot for the given dquot. */
+STATIC struct xqcheck_dqtrx *
+xqcheck_get_dqtrx(
+ struct xqcheck_dqacct *dqa,
+ xfs_dqtype_t q_type,
+ xfs_dqid_t q_id)
+{
+ int i;
+
+ for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) {
+ if (dqa->dqtrx[i].q_type == 0 ||
+ (dqa->dqtrx[i].q_type == q_type &&
+ dqa->dqtrx[i].q_id == q_id))
+ return &dqa->dqtrx[i];
+ }
+
+ return NULL;
+}
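+
+/*
+ * Note that the linear probe above returns either the slot already
+ * claimed for (q_type, q_id) or the first free slot (q_type == 0), which
+ * the caller is expected to initialize; repeated lookups within the same
+ * transaction therefore land on the same slot.
+ */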
+
+/*
+ * Create and fill out a quota delta tracking structure to shadow the updates
+ * going on in the regular quota code.
+ */
+static int
+xqcheck_mod_live_ino_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_mod_ino_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb);
+
+ /* Skip quota reservation fields. */
+ switch (action) {
+ case XFS_TRANS_DQ_BCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_ICOUNT:
+ case XFS_TRANS_DQ_RTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Ignore dqtrx updates for quota types we don't care about. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ if (!xqc->ucounts)
+ return NOTIFY_DONE;
+ break;
+ case XFS_DQTYPE_GROUP:
+ if (!xqc->gcounts)
+ return NOTIFY_DONE;
+ break;
+ case XFS_DQTYPE_PROJ:
+ if (!xqc->pcounts)
+ return NOTIFY_DONE;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Skip inodes that haven't been scanned yet. */
+ if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino))
+ return NOTIFY_DONE;
+
+ /* Make a shadow quota accounting tracker for this transaction. */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa) {
+ dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS);
+ if (!dqa)
+ goto out_abort;
+
+ dqa->tx_id = p->tx_id;
+ error = rhashtable_insert_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Find the shadow dqtrx (or an empty slot) here. */
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ if (!dqtrx)
+ goto out_abort;
+ if (dqtrx->q_type == 0) {
+ dqtrx->q_type = p->q_type;
+ dqtrx->q_id = p->q_id;
+ dqa->refcount++;
+ }
+
+ /* Update counter */
+ switch (action) {
+ case XFS_TRANS_DQ_BCOUNT:
+ dqtrx->bcount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_DELBCOUNT:
+ dqtrx->delbcnt_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_ICOUNT:
+ dqtrx->icount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_RTBCOUNT:
+ dqtrx->rtbcount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ dqtrx->delrtb_delta += p->delta;
+ break;
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Apply the transaction quota deltas to our shadow quota accounting info when
+ * the regular quota code is doing the same.
+ */
+static int
+xqcheck_apply_live_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_apply_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ struct xfarray *counts;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb);
+
+ /* Map the dquot type to an incore counter object. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ counts = xqc->ucounts;
+ break;
+ case XFS_DQTYPE_GROUP:
+ counts = xqc->gcounts;
+ break;
+ case XFS_DQTYPE_PROJ:
+ counts = xqc->pcounts;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL)
+ return NOTIFY_DONE;
+
+ /*
+ * Find the shadow dqtrx for this transaction and dquot, if any deltas
+ * need to be applied here. If not, we're finished early.
+ */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa)
+ goto out_unlock;
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ if (!dqtrx || dqtrx->q_type == 0)
+ goto out_unlock;
+
+ /* Update our shadow dquot if we're committing. */
+ if (action == XFS_APPLY_DQTRX_COMMIT) {
+ error = xqcheck_update_incore_counts(xqc, counts, p->q_id,
+ dqtrx->icount_delta,
+ dqtrx->bcount_delta + dqtrx->delbcnt_delta,
+ dqtrx->rtbcount_delta + dqtrx->delrtb_delta);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Free the shadow accounting structure if that was the last user. */
+ dqa->refcount--;
+ if (dqa->refcount == 0) {
+ error = rhashtable_remove_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ xqcheck_dqacct_free(dqa, NULL);
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
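+
+/*
+ * Example of the two outcomes above: a transaction that allocated 8
+ * blocks to an already-scanned file arrives here with bcount_delta == 8;
+ * an XFS_APPLY_DQTRX_COMMIT folds that delta into the shadow dquot, while
+ * any other action skips the fold and only releases the reference to the
+ * shadow accounting structure.
+ */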
+
+/* Record this inode's quota usage in our shadow quota counter data. */
+STATIC int
+xqcheck_collect_inode(
+ struct xqcheck *xqc,
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp = xqc->sc->tp;
+ xfs_filblks_t nblks, rtblks;
+ uint ilock_flags = 0;
+ xfs_dqid_t id;
+ bool isreg = S_ISREG(VFS_I(ip)->i_mode);
+ int error = 0;
+
+ if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+ /*
+ * Quota files are never counted towards quota, so we do not
+ * need to take the lock.
+ */
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ return 0;
+ }
+
+ /* Figure out the data / rt device block counts. */
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (isreg)
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Read in the data fork for rt files so that _count_blocks
+ * can count the number of blocks allocated from the rt volume.
+ * Inodes do not track that separately.
+ */
+ ilock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_abort;
+ } else {
+ ilock_flags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
+ xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ /* Update the shadow dquot counters. */
+ mutex_lock(&xqc->lock);
+ if (xqc->ucounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER);
+ error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->gcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP);
+ error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->pcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ);
+ error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+ mutex_unlock(&xqc->lock);
+
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ goto out_ilock;
+
+out_mutex:
+ mutex_unlock(&xqc->lock);
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_incomplete:
+ xchk_set_incomplete(xqc->sc);
+out_ilock:
+ xfs_iunlock(ip, ilock_flags);
+ if (isreg)
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/* Walk all the allocated inodes and run a quota scan on them. */
+STATIC int
+xqcheck_collect_counts(
+ struct xqcheck *xqc)
+{
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Cancel the transaction to release the log grant space while we scan
+ * the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done without
+ * risk of deadlock between sb_internal and the IOLOCK (we take the
+ * IOLOCK to quiesce the file before scanning) because empty
+ * transactions do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
+ error = xqcheck_collect_inode(xqc, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xqc->iscan);
+ if (error) {
+ xchk_set_incomplete(sc);
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+	 * Switch out for a real transaction in preparation for the dquot
+	 * comparison that follows.
+ */
+ xchk_trans_cancel(sc);
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing
+ * the resource usage counters against our shadow dquots; and then walk each
+ * shadow dquot (that wasn't covered in the first part), comparing it against
+ * the xfs_dquot.
+ */
+
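+/*
+ * For example, assuming the shadow entry for dquot 3 recorded icount == 4
+ * while the incore dquot reports q_ino.count == 5, the comparison below
+ * marks that dquot corrupt.  The COMPARE_SCANNED flag set during this pass
+ * is what lets the second walk skip the dquots that the first walk already
+ * visited.
+ */
+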
+/*
+ * Check the dquot data against what we observed. Caller must hold the dquot
+ * lock.
+ */
+STATIC int
+xqcheck_compare_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int error;
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ xchk_set_incomplete(xqc->sc);
+ return -ECANCELED;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ if (xcdq.icount != dq->q_ino.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ if (xcdq.bcount != dq->q_blk.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ if (xcdq.rtbcount != dq->q_rtb.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete. This
+ * should never happen outside of the collection phase.
+ */
+ xchk_set_incomplete(xqc->sc);
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error)
+ return error;
+
+ if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return error;
+}
+
+/*
+ * Walk all the observed dquots, and make sure there's a matching incore
+ * dquot and that its counts match ours.
+ */
+STATIC int
+xqcheck_walk_observations(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfs_dquot *dq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
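+		/*
+		 * The iterator advances the cursor past the record it just
+		 * loaded, so the dquot id of this record is cur - 1.
+		 */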
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq);
+ if (error == -ENOENT) {
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, id);
+ return 0;
+ }
+ if (error)
+ return error;
+
+ error = xqcheck_compare_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xqc->sc, &error))
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Compare the quota counters we observed against the live dquots. */
+STATIC int
+xqcheck_compare_dqtype(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_dquot *dq;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* If the quota CHKD flag is cleared, we need to repair this quota. */
+ if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) {
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0);
+ return 0;
+ }
+
+ /* Compare what we observed against the actual dquots. */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_compare_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /* Walk all the observed dquots and compare to the incore ones. */
+ return xqcheck_walk_observations(xqc, dqtype);
+}
+
+/* Tear down everything associated with a quotacheck. */
+static void
+xqcheck_teardown_scan(
+ void *priv)
+{
+ struct xqcheck *xqc = priv;
+ struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xqc->iscan);
+
+ /*
+ * As noted above, the apply hook is responsible for cleaning up the
+ * shadow dquot accounting data when a transaction completes. The mod
+ * hook must be removed before the apply hook so that we don't
+ * mistakenly leave an active shadow account for the mod hook to get
+ * its hands on. No hooks should be running after these functions
+ * return.
+ */
+ xfs_dqtrx_hook_del(qi, &xqc->qhook);
+
+ if (xqc->shadow_dquot_acct.key_len) {
+ rhashtable_free_and_destroy(&xqc->shadow_dquot_acct,
+ xqcheck_dqacct_free, NULL);
+ xqc->shadow_dquot_acct.key_len = 0;
+ }
+
+ if (xqc->pcounts) {
+ xfarray_destroy(xqc->pcounts);
+ xqc->pcounts = NULL;
+ }
+
+ if (xqc->gcounts) {
+ xfarray_destroy(xqc->gcounts);
+ xqc->gcounts = NULL;
+ }
+
+ if (xqc->ucounts) {
+ xfarray_destroy(xqc->ucounts);
+ xqc->ucounts = NULL;
+ }
+
+ xchk_iscan_teardown(&xqc->iscan);
+ mutex_destroy(&xqc->lock);
+ xqc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate quota counter data.
+ * If the scan is successful, the quota data will be left alive for a repair.
+ * If any error occurs, we'll tear everything down.
+ */
+STATIC int
+xqcheck_setup_scan(
+ struct xfs_scrub *sc,
+ struct xqcheck *xqc)
+{
+ char *descr;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL;
+ int error;
+
+ ASSERT(xqc->sc == NULL);
+ xqc->sc = sc;
+
+ mutex_init(&xqc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xqc->iscan);
+
+ error = -ENOMEM;
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) {
+ descr = xchk_xfile_descr(sc, "user dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->ucounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) {
+ descr = xchk_xfile_descr(sc, "group dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->gcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) {
+ descr = xchk_xfile_descr(sc, "project dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->pcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ /*
+ * Set up hash table to map transactions to our internal shadow dqtrx
+ * structures.
+ */
+ error = rhashtable_init(&xqc->shadow_dquot_acct,
+ &xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the quota code. The hook only triggers for inodes that
+ * were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ *
+ * The apply hook (which removes the shadow dquot accounting struct)
+ * must be installed before the mod hook so that we never fail to catch
+ * the end of a quota update sequence and leave stale shadow data.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_QUOTA);
+ xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx,
+ xqcheck_apply_live_dqtrx);
+
+ error = xfs_dqtrx_hook_add(qi, &xqc->qhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the quota count data to repair. */
+ sc->buf_cleanup = xqcheck_teardown_scan;
+ return 0;
+
+out_teardown:
+ xqcheck_teardown_scan(xqc);
+ return error;
+}
+
+/* Scrub all counters for a given quota type. */
+int
+xchk_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ int error = 0;
+
+ /* Check quota counters on the live filesystem. */
+ error = xqcheck_setup_scan(sc, xqc);
+ if (error)
+ return error;
+
+ /* Walk all inodes, picking up quota information. */
+ error = xqcheck_collect_counts(xqc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Fail fast if we're not playing with a full dataset. */
+ if (xchk_iscan_aborted(&xqc->iscan))
+ xchk_set_incomplete(sc);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare quota counters. */
+ if (xqc->ucounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+
+ /* Check one last time for an incomplete dataset. */
+ if (xchk_iscan_aborted(&xqc->iscan))
+ xchk_set_incomplete(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h
new file mode 100644
index 000000000000..4ea5f249c978
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTACHECK_H__
+#define __XFS_SCRUB_QUOTACHECK_H__
+
+/* Quota counters for live quotacheck. */
+struct xqcheck_dquot {
+ /* block usage count */
+ int64_t bcount;
+
+ /* inode usage count */
+ int64_t icount;
+
+ /* realtime block usage count */
+ int64_t rtbcount;
+
+ /* Record state */
+ unsigned int flags;
+};
+
+/*
+ * This incore dquot record has been written at least once. We never want to
+ * store an xqcheck_dquot that looks uninitialized.
+ */
+#define XQCHECK_DQUOT_WRITTEN (1U << 0)
+
+/* Already checked this dquot. */
+#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1)
+
+/* Already repaired this dquot. */
+#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2)
+
+/* Live quotacheck control structure. */
+struct xqcheck {
+ struct xfs_scrub *sc;
+
+ /* Shadow dquot counter data. */
+ struct xfarray *ucounts;
+ struct xfarray *gcounts;
+ struct xfarray *pcounts;
+
+ /* Lock protecting quotacheck count observations */
+ struct mutex lock;
+
+ struct xchk_iscan iscan;
+
+ /* Hooks into the quota code. */
+ struct xfs_dqtrx_hook qhook;
+
+ /* Shadow quota delta tracking structure. */
+ struct rhashtable shadow_dquot_acct;
+};
+
+/* Return the incore counter array for a given quota type. */
+static inline struct xfarray *
+xqcheck_counters_for(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQTYPE_USER:
+ return xqc->ucounts;
+ case XFS_DQTYPE_GROUP:
+ return xqc->gcounts;
+ case XFS_DQTYPE_PROJ:
+ return xqc->pcounts;
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
+#endif /* __XFS_SCRUB_QUOTACHECK_H__ */
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
new file mode 100644
index 000000000000..dd8554c755b5
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck Repair
+ * ======================
+ *
+ * Use the live quota counter information that we collected to replace the
+ * counter values in the incore dquots. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * dquot is locked.
+ */
+
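+/*
+ * A worked example of the counter update below: assuming the shadow data
+ * says a dquot should have bcount == 120 while the incore dquot has
+ * q_blk.count == 100, the repair adds delta == 20 to both the count and the
+ * reservation, which preserves whatever reservations running transactions
+ * are still holding against this dquot.
+ */
+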
+/* Commit new counters to a dquot. */
+static int
+xqcheck_commit_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int64_t delta;
+ bool dirty = false;
+ int error = 0;
+
+ /* Unlock the dquot just long enough to allocate a transaction. */
+ xfs_dqunlock(dq);
+ error = xchk_trans_alloc(xqc->sc, 0);
+ xfs_dqlock(dq);
+ if (error)
+ return error;
+
+ xfs_trans_dqjoin(xqc->sc->tp, dq);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_cancel;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ /* Adjust counters as needed. */
+ delta = (int64_t)xcdq.icount - dq->q_ino.count;
+ if (delta) {
+ dq->q_ino.reserved += delta;
+ dq->q_ino.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.bcount - dq->q_blk.count;
+ if (delta) {
+ dq->q_blk.reserved += delta;
+ dq->q_blk.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count;
+ if (delta) {
+ dq->q_rtb.reserved += delta;
+ dq->q_rtb.count += delta;
+ dirty = true;
+ }
+
+ xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the repair
+ * and must cancel the whole operation. This should never
+ * happen, but we need to catch it anyway.
+ */
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error || !dirty)
+ goto out_cancel;
+
+ trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id);
+
+ /* Commit the dirty dquot to disk. */
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ if (dq->q_id)
+ xfs_qm_adjust_dqtimers(dq);
+ xfs_trans_log_dquot(xqc->sc->tp, dq);
+
+ /*
+ * Transaction commit unlocks the dquot, so we must re-lock it so that
+	 * the caller can put the reference (which requires a locked
+	 * dquot).
+ */
+ error = xrep_trans_commit(xqc->sc);
+ xfs_dqlock(dq);
+ return error;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+out_cancel:
+ xchk_trans_cancel(xqc->sc);
+
+ /* Re-lock the dquot so the caller can put the reference. */
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Commit new quota counters for a particular quota type. */
+STATIC int
+xqcheck_commit_dqtype(
+ struct xqcheck *xqc,
+ unsigned int dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xqcheck_dquot xcdq;
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ struct xfs_dquot *dq;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ /*
+ * Update the counters of every dquot that the quota file knows about.
+ */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Make a second pass to deal with the dquots that we know about but
+ * the quota file previously did not know about.
+ */
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ /*
+ * Grab the dquot, allowing for dquot block allocation in a
+ * separate transaction. We committed the scrub transaction
+ * in a previous step, so we will not be creating nested
+ * transactions here.
+ */
+ error = xfs_qm_dqget(mp, id, dqtype, true, &dq);
+ if (error)
+ return error;
+
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Figure out quota CHKD flags for the running quota types. */
+static inline unsigned int
+xqcheck_chkd_flags(
+ struct xfs_mount *mp)
+{
+ unsigned int ret = 0;
+
+ if (XFS_IS_UQUOTA_ON(mp))
+ ret |= XFS_UQUOTA_CHKD;
+ if (XFS_IS_GQUOTA_ON(mp))
+ ret |= XFS_GQUOTA_CHKD;
+ if (XFS_IS_PQUOTA_ON(mp))
+ ret |= XFS_PQUOTA_CHKD;
+ return ret;
+}
+
+/* Commit the new dquot counters. */
+int
+xrep_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ unsigned int qflags = xqcheck_chkd_flags(sc->mp);
+ int error;
+
+ /*
+ * Clear the CHKD flag for the running quota types and commit the scrub
+ * transaction so that we can allocate new quota block mappings if we
+ * have to. If we crash after this point, the sb still has the CHKD
+ * flags cleared, so mount quotacheck will fix all of this up.
+ */
+ xrep_update_qflags(sc, qflags, 0);
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Commit the new counters to the dquots. */
+ if (xqc->ucounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER);
+ if (error)
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (error)
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (error)
+ return error;
+ }
+
+ /* Set the CHKD flags now that we've fixed quota counts. */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ xrep_update_qflags(sc, 0, qflags);
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+ struct xfs_mount *mp;
+ struct xfbtree xfbtree;
+ uint64_t nr_items;
+};
+
+int
+rcbag_init(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *btp,
+ struct rcbag **bagp)
+{
+ struct rcbag *bag;
+ int error;
+
+ bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS);
+ if (!bag)
+ return -ENOMEM;
+
+ bag->nr_items = 0;
+ bag->mp = mp;
+
+ error = rcbagbt_mem_init(mp, &bag->xfbtree, btp);
+ if (error)
+ goto out_bag;
+
+ *bagp = bag;
+ return 0;
+
+out_bag:
+ kfree(bag);
+ return error;
+}
+
+void
+rcbag_free(
+ struct rcbag **bagp)
+{
+ struct rcbag *bag = *bagp;
+
+ xfbtree_destroy(&bag->xfbtree);
+ kfree(bag);
+ *bagp = NULL;
+}
+
+/* Track an rmap in the refcount bag. */
+int
+rcbag_add(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = rcbagbt_lookup_eq(cur, rmap, &has);
+ if (error)
+ goto out_cur;
+
+ if (has) {
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bagrec.rbg_refcount++;
+ error = rcbagbt_update(cur, &bagrec);
+ if (error)
+ goto out_cur;
+ } else {
+ bagrec.rbg_startblock = rmap->rm_startblock;
+ bagrec.rbg_blockcount = rmap->rm_blockcount;
+ bagrec.rbg_refcount = 1;
+
+ error = rcbagbt_insert(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ error = xfbtree_trans_commit(&bag->xfbtree, tp);
+ if (error)
+ return error;
+
+ bag->nr_items++;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
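+
+/*
+ * Note that adding an rmap with the same (startblock, blockcount) twice does
+ * not store a duplicate record; the lookup above finds the existing record
+ * and bumps its rbg_refcount, which is what makes this structure a bag
+ * rather than a set.
+ */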
+
+/* Return the number of records in the bag. */
+uint64_t
+rcbag_count(
+ const struct rcbag *rcbag)
+{
+ return rcbag->nr_items;
+}
+
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+ return r->rbg_startblock + r->rbg_blockcount;
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+int
+rcbag_next_edge(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap,
+ bool next_valid,
+ uint32_t *next_bnop)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ uint32_t next_bno = NULLAGBLOCK;
+ int has;
+ int error;
+
+ if (next_valid)
+ next_bno = next_rmap->rm_startblock;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+ }
+
+ /*
+	 * We should have found /something/ because either next_rmap is the
+	 * next interesting rmap to look at after emitting this refcount
+	 * extent, or there are other rmaps in the bag contributing to the
+	 * current sharing count.  But if something is seriously wrong, bail out.
+ */
+ if (next_bno == NULLAGBLOCK) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ *next_bnop = next_bno;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
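+
+/*
+ * For example, assuming the bag holds records (10, 5) and (10, 2) and the
+ * next rmap starts at block 13, the candidate edges are 15, 12, and 13, so
+ * rcbag_next_edge returns 12 -- the first block at which the refcount
+ * changes.
+ */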
+
+/* Pop all refcount bag records that end at next_bno */
+int
+rcbag_remove_ending_at(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ uint32_t next_bno)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ /* go to the right edge of the tree */
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_decrement(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ if (rcbag_rec_next_bno(&bagrec) != next_bno)
+ continue;
+
+ error = xfs_btree_delete(cur, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bag->nr_items -= bagrec.rbg_refcount;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+ return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
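+
+/*
+ * Continuing the example above, removing the records that end at block 12
+ * deletes the (10, 2) record and decreases nr_items by that record's
+ * rbg_refcount, since a single record can stand in for several identical
+ * rmaps.
+ */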
+
+/* Dump the rcbag. */
+void
+rcbag_dump(
+ struct rcbag *bag,
+ struct xfs_trans *tp)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ unsigned long long nr = 0;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+ nr++,
+ (unsigned int)bagrec.rbg_startblock,
+ (unsigned int)bagrec.rbg_blockcount,
+ (unsigned long long)bagrec.rbg_refcount);
+ }
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_H__
+#define __XFS_SCRUB_RCBAG_H__
+
+struct xfs_mount;
+struct rcbag;
+struct xfs_buftarg;
+
+int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp,
+ struct rcbag **bagp);
+void rcbag_free(struct rcbag **bagp);
+int rcbag_add(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap);
+uint64_t rcbag_count(const struct rcbag *bag);
+
+int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap, bool next_valid,
+ uint32_t *next_bnop);
+int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp,
+ uint32_t next_bno);
+
+void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp);
+
+#endif /* __XFS_SCRUB_RCBAG_H__ */
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
new file mode 100644
index 000000000000..709356dc6256
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache *rcbagbt_cur_cache;
+
+STATIC void
+rcbagbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ struct rcbag_key *bag_key = (struct rcbag_key *)key;
+ const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec;
+
+ BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+ BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+
+ bag_key->rbg_startblock = bag_rec->rbg_startblock;
+ bag_key->rbg_blockcount = bag_rec->rbg_blockcount;
+}
+
+STATIC void
+rcbagbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec;
+ struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec;
+
+ bag_rec->rbg_startblock = bag_irec->rbg_startblock;
+ bag_rec->rbg_blockcount = bag_irec->rbg_blockcount;
+ bag_rec->rbg_refcount = bag_irec->rbg_refcount;
+}
+
+STATIC int64_t
+rcbagbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+ const struct rcbag_key *kp = (const struct rcbag_key *)key;
+
+ if (kp->rbg_startblock > rec->rbg_startblock)
+ return 1;
+ if (kp->rbg_startblock < rec->rbg_startblock)
+ return -1;
+
+ if (kp->rbg_blockcount > rec->rbg_blockcount)
+ return 1;
+ if (kp->rbg_blockcount < rec->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int64_t
+rcbagbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ ASSERT(mask == NULL);
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 1;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return -1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 1;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 0;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return 1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 0;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1;
+ const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2;
+
+ if (rp1->rbg_startblock > rp2->rbg_startblock)
+ return 0;
+ if (rp1->rbg_startblock < rp2->rbg_startblock)
+ return 1;
+
+ if (rp1->rbg_blockcount > rp2->rbg_blockcount)
+ return 0;
+ if (rp1->rbg_blockcount < rp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+static xfs_failaddr_t
+rcbagbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= rcbagbt_maxlevels_possible())
+ return __this_address;
+
+ maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+rcbagbt_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = rcbagbt_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+ .name = "rcbagbt_mem",
+ .magic = { 0, cpu_to_be32(RCBAG_MAGIC) },
+ .verify_read = rcbagbt_rw_verify,
+ .verify_write = rcbagbt_rw_verify,
+ .verify_struct = rcbagbt_verify,
+};
+
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+ .name = "rcbag",
+ .type = XFS_BTREE_TYPE_MEM,
+
+ .rec_len = sizeof(struct rcbag_rec),
+ .key_len = sizeof(struct rcbag_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = 1,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = rcbagbt_init_key_from_rec,
+ .init_rec_from_cur = rcbagbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = rcbagbt_key_diff,
+ .buf_ops = &rcbagbt_mem_buf_ops,
+ .diff_two_keys = rcbagbt_diff_two_keys,
+ .keys_inorder = rcbagbt_keys_inorder,
+ .recs_inorder = rcbagbt_recs_inorder,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfbtree *xfbtree)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+ rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+ cur->bc_mem.xfbtree = xfbtree;
+ cur->bc_nlevels = xfbtree->nlevels;
+ return cur;
+}
+
+/* Create an in-memory refcount bag btree. */
+int
+rcbagbt_mem_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp)
+{
+ xfbt->owner = 0;
+ return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops);
+}
+
+/* Calculate the number of records that fit in blocklen bytes of a btree block. */
+static inline unsigned int
+rcbagbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct rcbag_rec);
+ return blocklen /
+ (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t));
+}
+
+/*
+ * Calculate the number of records in a refcount bag btree block, accounting
+ * for the block header.
+ */
+unsigned int
+rcbagbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= RCBAG_BLOCK_LEN;
+ return rcbagbt_block_maxrecs(blocklen, leaf);
+}
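+
+/*
+ * For example, assuming 4096-byte in-memory btree blocks and the 72-byte
+ * long-format CRC block header, a leaf holds (4096 - 72) / 16 = 251 records
+ * and a node holds (4096 - 72) / (8 + 8) = 251 key/pointer pairs.
+ */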
+
+/* Compute the max possible height for refcount bag btrees. */
+unsigned int
+rcbagbt_maxlevels_possible(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_space_to_height(minrecs, ULLONG_MAX);
+}
+
+/* Calculate the refcount bag btree size for some records. */
+unsigned long long
+rcbagbt_calc_size(
+ unsigned long long nr_records)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_calc_size(minrecs, nr_records);
+}
+
+int __init
+rcbagbt_init_cur_cache(void)
+{
+ rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur",
+ xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()),
+ 0, 0, NULL);
+
+ if (!rcbagbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+rcbagbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(rcbagbt_cur_cache);
+ rcbagbt_cur_cache = NULL;
+}
+
+/* Look up the refcount bag record corresponding to this reverse mapping. */
+int
+rcbagbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap,
+ int *success)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+
+ rec->rbg_startblock = rmap->rm_startblock;
+ rec->rbg_blockcount = rmap->rm_blockcount;
+
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success);
+}
+
+/* Get the data from the pointed-to record. */
+int
+rcbagbt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct rcbag_rec *rec,
+ int *has)
+{
+ union xfs_btree_rec *btrec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &btrec, has);
+ if (error || !(*has))
+ return error;
+
+ memcpy(rec, btrec, sizeof(struct rcbag_rec));
+ return 0;
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_update(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec)
+{
+ union xfs_btree_rec btrec;
+
+ memcpy(&btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_update(cur, &btrec);
+}
+
+/* Insert the given record at the cursor's position. */
+int
+rcbagbt_insert(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec,
+ int *success)
+{
+ struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec;
+
+ memcpy(btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_insert(cur, success);
+}
diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h
new file mode 100644
index 000000000000..03cadb032552
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_BTREE_H__
+#define __XFS_SCRUB_RCBAG_BTREE_H__
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */
+
+struct rcbag_key {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+};
+
+struct rcbag_rec {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+ uint64_t rbg_refcount;
+};
+
+typedef __be64 rcbag_ptr_t;
+
+/* reflinks only exist on crc enabled filesystems */
+#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define RCBAG_REC_ADDR(block, index) \
+ ((struct rcbag_rec *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct rcbag_rec))))
+
+#define RCBAG_KEY_ADDR(block, index) \
+ ((struct rcbag_key *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct rcbag_key)))
+
+#define RCBAG_PTR_ADDR(block, index, maxrecs) \
+ ((rcbag_ptr_t *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct rcbag_key) + \
+ ((index) - 1) * sizeof(rcbag_ptr_t)))
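+
+/*
+ * As with the other XFS btree address macros, @index is 1-based, which is
+ * why each macro subtracts one before scaling.
+ */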
+
+unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+
+unsigned long long rcbagbt_calc_size(unsigned long long nr_records);
+
+unsigned int rcbagbt_maxlevels_possible(void);
+
+int __init rcbagbt_init_cur_cache(void);
+void rcbagbt_destroy_cur_cache(void);
+
+struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp);
+
+int rcbagbt_lookup_eq(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap, int *success);
+int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has);
+int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec);
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec,
+ int *success);
+
+#else
+# define rcbagbt_init_cur_cache() 0
+# define rcbagbt_destroy_cur_cache() ((void)0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index 16462332c897..dfdcb96b6c16 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -281,7 +281,7 @@ xchk_dir_walk(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
@@ -332,7 +332,7 @@ xchk_dir_lookup(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
error = xfs_dir2_sf_lookup(&args);
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index f99eca799809..0252a3b5b65a 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -114,7 +114,7 @@ xreap_put_freelist(
int error;
/* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
+ error = xrep_fix_freelist(sc, 0);
if (error)
return error;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index bf22f245bbfa..d0c7d4a29c0f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,8 +7,10 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
@@ -17,6 +19,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reference count btrees.
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt(
{
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_refcountbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index f38fccc42a20..a00d7ce7ae5b 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -25,6 +25,7 @@
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -37,6 +38,7 @@
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
+#include "scrub/rcbag.h"
/*
* Rebuilding the Reference Count Btree
@@ -97,12 +99,6 @@
* insert all the records.
*/
-/* The only parts of the rmap that we care about for computing refcounts. */
-struct xrep_refc_rmap {
- xfs_agblock_t startblock;
- xfs_extlen_t blockcount;
-} __packed;
-
struct xrep_refc {
/* refcount extents */
struct xfarray *refcount_records;
@@ -122,6 +118,20 @@ struct xrep_refc {
xfs_extlen_t btblocks;
};
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_ag_refcountbt(
+ struct xfs_scrub *sc)
+{
+ char *descr;
+ int error;
+
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ return error;
+}
+
/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
@@ -223,10 +233,9 @@ xrep_refc_rmap_shareable(
STATIC int
xrep_refc_walk_rmaps(
struct xrep_refc *rr,
- struct xrep_refc_rmap *rrm,
+ struct xfs_rmap_irec *rmap,
bool *have_rec)
{
- struct xfs_rmap_irec rmap;
struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
struct xfs_mount *mp = cur->bc_mp;
int have_gt;
@@ -250,29 +259,30 @@ xrep_refc_walk_rmaps(
if (!have_gt)
return 0;
- error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, !have_gt))
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
- if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
- error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
- rmap.rm_blockcount);
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
- } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+ } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
/* refcountbt block, dump it when we're done. */
- rr->btblocks += rmap.rm_blockcount;
+ rr->btblocks += rmap->rm_blockcount;
error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
- rmap.rm_startblock, rmap.rm_blockcount);
+ rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
}
- } while (!xrep_refc_rmap_shareable(mp, &rmap));
+ } while (!xrep_refc_rmap_shareable(mp, rmap));
- rrm->startblock = rmap.rm_startblock;
- rrm->blockcount = rmap.rm_blockcount;
*have_rec = true;
return 0;
}
@@ -354,45 +364,6 @@ xrep_refc_sort_records(
return error;
}
-#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
-/*
- * Find the next block where the refcount changes, given the next rmap we
- * looked at and the ones we're already tracking.
- */
-static inline int
-xrep_refc_next_edge(
- struct xfarray *rmap_bag,
- struct xrep_refc_rmap *next_rrm,
- bool next_valid,
- xfs_agblock_t *nbnop)
-{
- struct xrep_refc_rmap rrm;
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
- xfs_agblock_t nbno = NULLAGBLOCK;
- int error;
-
- if (next_valid)
- nbno = next_rrm->startblock;
-
- while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
- nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
-
- if (error)
- return error;
-
- /*
- * We should have found /something/ because either next_rrm is the next
- * interesting rmap to look at after emitting this refcount extent, or
- * there are other rmaps in rmap_bag contributing to the current
- * sharing count. But if something is seriously wrong, bail out.
- */
- if (nbno == NULLAGBLOCK)
- return -EFSCORRUPTED;
-
- *nbnop = nbno;
- return 0;
-}
-
/*
* Walk forward through the rmap btree to collect all rmaps starting at
* @bno in @rmap_bag. These represent the file(s) that share ownership of
@@ -402,22 +373,21 @@ xrep_refc_next_edge(
static int
xrep_refc_push_rmaps_at(
struct xrep_refc *rr,
- struct xfarray *rmap_bag,
+ struct rcbag *rcstack,
xfs_agblock_t bno,
- struct xrep_refc_rmap *rrm,
- bool *have,
- uint64_t *stack_sz)
+ struct xfs_rmap_irec *rmap,
+ bool *have)
{
struct xfs_scrub *sc = rr->sc;
int have_gt;
int error;
- while (*have && rrm->startblock == bno) {
- error = xfarray_store_anywhere(rmap_bag, rrm);
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
if (error)
return error;
- (*stack_sz)++;
- error = xrep_refc_walk_rmaps(rr, rrm, have);
+
+ error = xrep_refc_walk_rmaps(rr, rmap, have);
if (error)
return error;
}
@@ -425,8 +395,10 @@ xrep_refc_push_rmaps_at(
error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
if (error)
return error;
- if (XFS_IS_CORRUPT(sc->mp, !have_gt))
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sa.rmap_cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -436,12 +408,9 @@ STATIC int
xrep_refc_find_refcounts(
struct xrep_refc *rr)
{
- struct xrep_refc_rmap rrm;
struct xfs_scrub *sc = rr->sc;
- struct xfarray *rmap_bag;
- char *descr;
- uint64_t old_stack_sz;
- uint64_t stack_sz = 0;
+ struct rcbag *rcstack;
+ uint64_t old_stack_height;
xfs_agblock_t sbno;
xfs_agblock_t cbno;
xfs_agblock_t nbno;
@@ -451,14 +420,11 @@ xrep_refc_find_refcounts(
xrep_ag_btcur_init(sc, &sc->sa);
/*
- * Set up a sparse array to store all the rmap records that we're
- * tracking to generate a reference count record. If this exceeds
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If the size of the bag exceeds
* MAXREFCOUNT, we clamp rc_refcount.
*/
- descr = xchk_xfile_ag_descr(sc, "rmap record bag");
- error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
- &rmap_bag);
- kfree(descr);
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
if (error)
goto out_cur;
@@ -469,62 +435,54 @@ xrep_refc_find_refcounts(
/* Process reverse mappings into refcount data. */
while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
/* Push all rmaps with pblk == sbno onto the stack */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (!have)
break;
- sbno = cbno = rrm.startblock;
- error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
- &rrm, &have, &stack_sz);
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
if (error)
goto out_bag;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
if (error)
goto out_bag;
ASSERT(nbno > sbno);
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
/* While stack isn't empty... */
- while (stack_sz) {
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
-
+ while (rcbag_count(rcstack) > 0) {
/* Pop all rmaps that end at nbno */
- while ((error = xfarray_iter(rmap_bag, &array_cur,
- &rrm)) == 1) {
- if (RRM_NEXT(rrm) != nbno)
- continue;
- error = xfarray_unset(rmap_bag, array_cur - 1);
- if (error)
- goto out_bag;
- stack_sz--;
- }
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
if (error)
goto out_bag;
/* Push array items that start at nbno */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (have) {
- error = xrep_refc_push_rmaps_at(rr, rmap_bag,
- nbno, &rrm, &have, &stack_sz);
+ error = xrep_refc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
if (error)
goto out_bag;
}
/* Emit refcount if necessary */
ASSERT(nbno > cbno);
- if (stack_sz != old_stack_sz) {
- if (old_stack_sz > 1) {
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
error = xrep_refc_stash(rr,
XFS_REFC_DOMAIN_SHARED,
cbno, nbno - cbno,
- old_stack_sz);
+ old_stack_height);
if (error)
goto out_bag;
}
@@ -532,13 +490,13 @@ xrep_refc_find_refcounts(
}
/* Stack empty, go find the next rmap */
- if (stack_sz == 0)
+ if (rcbag_count(rcstack) == 0)
break;
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
sbno = nbno;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
&nbno);
if (error)
goto out_bag;
@@ -547,14 +505,13 @@ xrep_refc_find_refcounts(
}
}
- ASSERT(stack_sz == 0);
+ ASSERT(rcbag_count(rcstack) == 0);
out_bag:
- xfarray_destroy(rmap_bag);
+ rcbag_free(&rcstack);
out_cur:
xchk_ag_btcur_free(&sc->sa);
return error;
}
-#undef RRM_NEXT
/* Retrieve refcountbt data for bulk load. */
STATIC int
@@ -653,8 +610,8 @@ xrep_refc_build_new_tree(
rr->new_btree.bload.claim_block = xrep_refc_claim_block;
/* Compute how many blocks we'll need. */
- refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
- pag);
+ refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
error = xfs_btree_bload_compute_geometry(refc_cur,
&rr->new_btree.bload,
xfarray_length(rr->refcount_records));
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 745d5b8f405a..f43dce771cdd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -30,12 +30,15 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_reflink.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/stats.h"
+#include "scrub/xfile.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -400,7 +403,7 @@ xrep_calc_ag_resblks(
int
xrep_fix_freelist(
struct xfs_scrub *sc,
- bool can_shrink)
+ int alloc_flags)
{
struct xfs_alloc_arg args = {0};
@@ -410,8 +413,7 @@ xrep_fix_freelist(
args.alignment = 1;
args.pag = sc->sa.pag;
- return xfs_alloc_fix_freelist(&args,
- can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+ return xfs_alloc_fix_freelist(&args, alloc_flags);
}
/*
@@ -687,6 +689,44 @@ xrep_find_ag_btree_roots(
}
#ifdef CONFIG_XFS_QUOTA
+/* Update some quota flags in the superblock. */
+void
+xrep_update_qflags(
+ struct xfs_scrub *sc,
+ unsigned int clear_flags,
+ unsigned int set_flags)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ if ((mp->m_qflags & clear_flags) == 0 &&
+ (mp->m_qflags & set_flags) == set_flags)
+ goto no_update;
+
+ mp->m_qflags &= ~clear_flags;
+ mp->m_qflags |= set_flags;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags &= ~clear_flags;
+ mp->m_sb.sb_qflags |= set_flags;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Update the quota flags in the ondisk superblock without touching
+ * the summary counters. We have not quiesced inode chunk allocation,
+ * so we cannot coordinate with updates to the icount and ifree percpu
+ * counters.
+ */
+ bp = xfs_trans_getsb(sc->tp);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+no_update:
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+}
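+
+/*
+ * Callers use this in pairs: xrep_quotacheck() first clears the CHKD flags
+ * with xrep_update_qflags(sc, qflags, 0) before rewriting any counters, and
+ * then sets them again with xrep_update_qflags(sc, 0, qflags) once the
+ * counters are known to be correct.
+ */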
+
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
@@ -699,13 +739,7 @@ xrep_force_quotacheck(
if (!(flag & sc->mp->m_qflags))
return;
- mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
- sc->mp->m_qflags &= ~flag;
- spin_lock(&sc->mp->m_sb_lock);
- sc->mp->m_sb.sb_qflags &= ~flag;
- spin_unlock(&sc->mp->m_sb_lock);
- xfs_log_sb(sc->tp);
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ xrep_update_qflags(sc, flag, 0);
}
/*
@@ -799,20 +833,20 @@ xrep_ag_btcur_init(
/* Set up a bnobt cursor for cross-referencing. */
if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
}
/* Set up a inobt cursor for cross-referencing. */
if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sa->agi_bp, XFS_BTNUM_INO);
+ sa->agi_bp);
if (xfs_has_finobt(mp))
- sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
- sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+ sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
+ sc->tp, sa->agi_bp);
}
/* Set up a rmapbt cursor for cross-referencing. */
@@ -1115,3 +1149,55 @@ xrep_metadata_inode_forks(
return 0;
}
+
+/*
+ * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating
+ * a shmem file might take locks, so we cannot be in transaction context. Park
+ * our resources in the scrub context and let the teardown function take care
+ * of them at the right time.
+ */
+int
+xrep_setup_xfbtree(
+ struct xfs_scrub *sc,
+ const char *descr)
+{
+ ASSERT(sc->tp == NULL);
+
+ return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
+}
+
+/*
+ * Create a dummy transaction for use in a live update hook function. This
+ * function MUST NOT be called from regular repair code because the current
+ * process' transaction is saved via the cookie.
+ */
+int
+xrep_trans_alloc_hook_dummy(
+ struct xfs_mount *mp,
+ void **cookiep,
+ struct xfs_trans **tpp)
+{
+ int error;
+
+ *cookiep = current->journal_info;
+ current->journal_info = NULL;
+
+ error = xfs_trans_alloc_empty(mp, tpp);
+ if (!error)
+ return 0;
+
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+ return error;
+}
+
+/* Cancel a dummy transaction used by a live update hook function. */
+void
+xrep_trans_cancel_hook_dummy(
+ void **cookiep,
+ struct xfs_trans *tp)
+{
+ xfs_trans_cancel(tp);
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+}
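+
+/*
+ * A hook function would bracket its updates with this pair, e.g.:
+ *
+ *	error = xrep_trans_alloc_hook_dummy(mp, &cookie, &tp);
+ *	if (error)
+ *		return NOTIFY_DONE;
+ *	(make in-memory btree updates with tp)
+ *	xrep_trans_cancel_hook_dummy(&cookie, tp);
+ *
+ * so that the empty transaction never clobbers the journal_info of the
+ * thread whose transaction triggered the hook.
+ */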
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 17114327e6fa..ce082d941459 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -51,7 +51,7 @@ struct xbitmap;
struct xagb_bitmap;
struct xfsb_bitmap;
-int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
+int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -72,6 +72,8 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
#ifdef CONFIG_XFS_QUOTA
+void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags,
+ unsigned int set_flags);
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
#else
@@ -79,11 +81,15 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
# define xrep_ino_dqattach(sc) (0)
#endif /* CONFIG_XFS_QUOTA */
+int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr);
+
int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
xfs_extnum_t nextents);
int xrep_reset_perag_resv(struct xfs_scrub *sc);
int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -109,11 +115,14 @@ int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
int xrep_allocbt(struct xfs_scrub *sc);
int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_rmapbt(struct xfs_scrub *sc);
int xrep_refcountbt(struct xfs_scrub *sc);
int xrep_inode(struct xfs_scrub *sc);
int xrep_bmap_data(struct xfs_scrub *sc);
int xrep_bmap_attr(struct xfs_scrub *sc);
int xrep_bmap_cow(struct xfs_scrub *sc);
+int xrep_nlinks(struct xfs_scrub *sc);
+int xrep_fscounters(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
@@ -123,13 +132,19 @@ int xrep_rtbitmap(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_QUOTA
int xrep_quota(struct xfs_scrub *sc);
+int xrep_quotacheck(struct xfs_scrub *sc);
#else
# define xrep_quota xrep_notsupported
+# define xrep_quotacheck xrep_notsupported
#endif /* CONFIG_XFS_QUOTA */
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
+int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
+ struct xfs_trans **tpp);
+void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
+
#else
#define xrep_ino_dqattach(sc) (0)
@@ -171,6 +186,8 @@ xrep_setup_nothing(
return 0;
}
#define xrep_setup_ag_allocbt xrep_setup_nothing
+#define xrep_setup_ag_rmapbt xrep_setup_nothing
+#define xrep_setup_ag_refcountbt xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
@@ -184,6 +201,7 @@ xrep_setup_nothing(
#define xrep_agi xrep_notsupported
#define xrep_allocbt xrep_notsupported
#define xrep_iallocbt xrep_notsupported
+#define xrep_rmapbt xrep_notsupported
#define xrep_refcountbt xrep_notsupported
#define xrep_inode xrep_notsupported
#define xrep_bmap_data xrep_notsupported
@@ -191,6 +209,9 @@ xrep_setup_nothing(
#define xrep_bmap_cow xrep_notsupported
#define xrep_rtbitmap xrep_notsupported
#define xrep_quota xrep_notsupported
+#define xrep_quotacheck xrep_notsupported
+#define xrep_nlinks xrep_notsupported
+#define xrep_fscounters xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index c99d1714f283..ba5bbc3fb754 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -25,6 +25,7 @@
#include "scrub/btree.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -36,6 +37,14 @@ xchk_setup_ag_rmapbt(
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_rmapbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
@@ -349,7 +358,7 @@ xchk_rmapbt_rec(
struct xfs_rmap_irec irec;
if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
- xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
+ xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -412,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
cur = sc->sa.bno_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.bno_cur)
xfs_btree_del_cursor(cur, error);
@@ -422,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata(
cur = sc->sa.cnt_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.cnt_cur)
xfs_btree_del_cursor(cur, error);
@@ -447,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_INOBT: inobt, finobt */
cur = sc->sa.ino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
- XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.ino_cur)
xfs_btree_del_cursor(cur, error);
@@ -458,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata(
if (xfs_has_finobt(sc->mp)) {
cur = sc->sa.fino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sc->sa.agi_bp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp,
+ sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.fino_cur)
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
new file mode 100644
index 000000000000..e8e07b683eab
--- /dev/null
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -0,0 +1,1697 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Reverse Mapping Btree Repair
+ * ============================
+ *
+ * This is the most involved of all the AG space btree rebuilds. Everywhere
+ * else in XFS we lock inodes and then AG data structures, but generating the
+ * list of rmap records requires that we be able to scan both block mapping
+ * btrees of every inode in the filesystem to see if it owns any extents in
+ * this AG. We can't tolerate any inode updates while we do this, so we
+ * freeze the filesystem to lock everyone else out, and grant ourselves
+ * special privileges to run transactions with regular background reclamation
+ * turned off.
+ *
+ * We also have to be very careful not to allow inode reclaim to start a
+ * transaction because all transactions (other than our own) will block.
+ * Deferred inode inactivation helps us out there.
+ *
+ * I) Reverse mappings for all non-space metadata and file data are collected
+ * according to the following algorithm:
+ *
+ * 1. For each fork of each inode:
+ * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
+ * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
+ * bmaps into rmap records (see 1.4). Set bits in BMBIT for each btree
+ * block.
+ * 1.3. If the incore extent map is loaded but the fork is in btree format,
+ * just visit the bmbt blocks to set the corresponding BMBIT areas.
+ * 1.4. From the incore extent map, accumulate each bmap that falls into our
+ * target AG. Remember, multiple bmap records can map to a single rmap
+ * record, so we cannot simply emit rmap records 1:1.
+ * 1.5. Emit rmap records for each extent in BMBIT and free it.
+ * 2. Create bitmaps INOBIT and ICHUNKBIT.
+ * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
+ * and set bits in INOBIT for each btree block. If the inobt has no records
+ * at all, we must be careful to record its root in INOBIT.
+ * 4. For each block in the finobt, set the corresponding INOBIT area.
+ * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
+ * 6. Create bitmaps REFCBIT and COWBIT.
+ * 7. For each CoW staging extent in the refcountbt, set the corresponding
+ * areas in COWBIT.
+ * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
+ * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
+ * A. Emit rmap for the AG headers.
+ * B. Emit rmap for the log, if there is one.
+ *
+ * II) The rmapbt shape and space metadata rmaps are computed as follows:
+ *
+ * 1. Count the rmaps collected in the previous step. (= NR)
+ * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
+ * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
+ * 4. Create bitmap AGBIT.
+ * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
+ * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
+ * 7. Count the extents in AGBIT. (= AGNR)
+ * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
+ * 9. If RMB' > RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
+ * and clear AGBIT. Go to step 5.
+ * A. Emit rmaps for each extent in AGBIT.
+ *
+ * III) The rmapbt is constructed and set in place as follows:
+ *
+ * 1. Sort the rmap records.
+ * 2. Bulk load the rmaps.
+ *
+ * IV) Reap the old btree blocks.
+ *
+ * 1. Create a bitmap OLDRMBIT.
+ * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
+ * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
+ * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
+ * the parts of the AG that the rmap didn't find during its scan of the
+ * primary metadata and aren't known to be in the free space, which implies
+ * that they were the old rmapbt blocks.
+ * 5. Commit.
+ *
+ * We use the 'xrep_rmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rmap {
+ /* new rmapbt information */
+ struct xrep_newbt new_btree;
+
+ /* lock for the xfbtree and xfile */
+ struct mutex lock;
+
+ /* rmap records generated from primary metadata */
+ struct xfbtree rmap_btree;
+
+ struct xfs_scrub *sc;
+
+ /* in-memory btree cursor for the xfs_btree_bload iteration */
+ struct xfs_btree_cur *mcur;
+
+ /* Hooks into rmap update code. */
+ struct xfs_rmap_hook rhook;
+
+ /* inode scan cursor */
+ struct xchk_iscan iscan;
+
+ /* Number of non-freespace records found. */
+ unsigned long long nr_records;
+
+ /* bnobt/cntbt contribution to btreeblks */
+ xfs_agblock_t freesp_btblocks;
+
+ /* old agf_rmap_blocks counter */
+ unsigned int old_rmapbt_fsbcount;
+};
+
+/* Set us up to repair reverse mapping btrees. */
+int
+xrep_setup_ag_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+ descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->sc = sc;
+ sc->buf = rr;
+ return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rmap_check_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rmap_stash(
+ struct xrep_rmap *rr,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_startblock = startblock,
+ .rm_blockcount = blockcount,
+ .rm_owner = owner,
+ .rm_offset = offset,
+ .rm_flags = flags,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *mcur;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
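+ /* If the scan was aborted, the collected records can't be trusted. */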
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
+ error = xfs_rmap_map_raw(mcur, &rmap);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
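+ /* Commit the in-memory btree updates before dropping the lock. */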
+ error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
+ if (error)
+ goto out_abort;
+
+ mutex_unlock(&rr->lock);
+ return 0;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+ return error;
+}
+
+struct xrep_rmap_stash_run {
+ struct xrep_rmap *rr;
+ uint64_t owner;
+ unsigned int rmap_flags;
+};
+
+static int
+xrep_rmap_stash_run(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_rmap_stash_run *rsr = priv;
+ struct xrep_rmap *rr = rsr->rr;
+
+ return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rmap_stash_bitmap(
+ struct xrep_rmap *rr,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ .rmap_flags = 0,
+ };
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+ return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
+}
+
+/* Section (I): Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rmap_ifork {
+ /*
+ * Accumulate rmap data here to turn multiple adjacent bmaps into a
+ * single rmap.
+ */
+ struct xfs_rmap_irec accum;
+
+ /* Bitmap of bmbt blocks in this AG. */
+ struct xagb_bitmap bmbt_blocks;
+
+ struct xrep_rmap *rr;
+
+ /* Which inode fork? */
+ int whichfork;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rmap_stash_accumulated(
+ struct xrep_rmap_ifork *rf)
+{
+ if (rf->accum.rm_blockcount == 0)
+ return 0;
+
+ return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
+ rf->accum.rm_blockcount, rf->accum.rm_owner,
+ rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rmap_visit_bmbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_mount *mp = rf->rr->sc->mp;
+ struct xfs_rmap_irec *accum = &rf->accum;
+ xfs_agblock_t agbno;
+ unsigned int rmap_flags = 0;
+ int error;
+
+ if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
+ rf->rr->sc->sa.pag->pag_agno)
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
+ if (rf->whichfork == XFS_ATTR_FORK)
+ rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (rec->br_state == XFS_EXT_UNWRITTEN)
+ rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+ /* If this bmap is adjacent to the previous one, just add it. */
+ if (accum->rm_blockcount > 0 &&
+ rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+ agbno == accum->rm_startblock + accum->rm_blockcount &&
+ rmap_flags == accum->rm_flags) {
+ accum->rm_blockcount += rec->br_blockcount;
+ return 0;
+ }
+
+ /* Otherwise stash the old rmap and start accumulating a new one. */
+ error = xrep_rmap_stash_accumulated(rf);
+ if (error)
+ return error;
+
+ accum->rm_startblock = agbno;
+ accum->rm_blockcount = rec->br_blockcount;
+ accum->rm_offset = rec->br_startoff;
+ accum->rm_flags = rmap_flags;
+ return 0;
+}
+
+/* Add a btree block to the bitmap. */
+STATIC int
+xrep_rmap_visit_iroot_btree_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
+}
+
+/*
+ * Iterate a metadata btree rooted in an inode to collect rmap records for
+ * anything in this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iroot_btree(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_owner_info oinfo;
+ struct xrep_rmap *rr = rf->rr;
+ int error;
+
+ xagb_bitmap_init(&rf->bmbt_blocks);
+
+ /* Record all the blocks in the btree itself. */
+ error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
+ XFS_BTREE_VISIT_ALL, rf);
+ if (error)
+ goto out;
+
+ /* Emit rmaps for the btree blocks. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
+ error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
+ if (error)
+ goto out;
+
+ /* Stash any remaining accumulated rmaps. */
+ error = xrep_rmap_stash_accumulated(rf);
+out:
+ xagb_bitmap_destroy(&rf->bmbt_blocks);
+ return error;
+}
+
+static inline bool
+is_rt_data_fork(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that matches the AG. Sets @mappings_done to true if we've scanned the
+ * block mappings in this fork.
+ */
+STATIC int
+xrep_rmap_scan_bmbt(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_inode *ip,
+ bool *mappings_done)
+{
+ struct xrep_rmap *rr = rf->rr;
+ struct xfs_btree_cur *cur;
+ struct xfs_ifork *ifp;
+ int error;
+
+ *mappings_done = false;
+ ifp = xfs_ifork_ptr(ip, rf->whichfork);
+ cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
+
+ if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
+ xfs_need_iread_extents(ifp)) {
+ /*
+ * If the incore extent cache isn't loaded, scan the bmbt for
+ * mapping records. This avoids loading the incore extent
+ * tree, which will increase memory pressure at a time when
+ * we're trying to run as quickly as we possibly can. Ignore
+ * realtime extents.
+ */
+ error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
+ if (error)
+ goto out_cur;
+
+ *mappings_done = true;
+ }
+
+ /* Scan for the bmbt blocks, which always live on the data device. */
+ error = xrep_rmap_scan_iroot_btree(rf, cur);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iext(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
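+ /* Delalloc extents have no physical space mapped yet; skip them. */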
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
+ if (error)
+ return error;
+ }
+
+ return xrep_rmap_stash_accumulated(rf);
+}
+
+/* Find all the extents from a given AG in an inode fork. */
+STATIC int
+xrep_rmap_scan_ifork(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xrep_rmap_ifork rf = {
+ .accum = { .rm_owner = ip->i_ino, },
+ .rr = rr,
+ .whichfork = whichfork,
+ };
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int error = 0;
+
+ if (!ifp)
+ return 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ bool mappings_done;
+
+ /*
+ * Scan the bmap btree for data device mappings. This includes
+ * the btree blocks themselves, even if this is a realtime
+ * file.
+ */
+ error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
+ if (error || mappings_done)
+ return error;
+ } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ return 0;
+ }
+
+ /* Scan incore extent cache if this isn't a realtime file. */
+ if (xfs_ifork_is_realtime(ip, whichfork))
+ return 0;
+
+ return xrep_rmap_scan_iext(&rf, ifp);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
+ * attr bmbt. Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_rmap_scan_ilock(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rmap_scan_inode(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode = 0;
+ int error;
+
+ /*
+ * Directory updates (create/link/unlink/rename) drop the directory's
+ * ILOCK before finishing any rmapbt updates associated with directory
+ * shape changes. For this scan to coordinate correctly with the live
+ * update hook, we must take the only lock (i_rwsem) that is held all
+ * the way to dir op completion. This will get fixed by the parent
+ * pointer patchset.
+ */
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ lock_mode = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ }
+ lock_mode |= xrep_rmap_scan_ilock(ip);
+
+ /* Check the data fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* Check the attr fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* COW fork extents are "owned" by the refcount btree. */
+
+ xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Section (I): Find all AG metadata extents except for free space metadata. */
+
+struct xrep_rmap_inodes {
+ struct xrep_rmap *rr;
+ struct xagb_bitmap inobt_blocks; /* INOBIT */
+ struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */
+};
+
+/* Record inode btree rmaps. */
+STATIC int
+xrep_rmap_walk_inobt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_rmap_inodes *ri = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agino_t agino;
+ xfs_agino_t iperhole;
+ unsigned int i;
+ int error;
+
+ /* Record the inobt blocks. */
+ error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
+ if (error)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+ if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
+ return -EFSCORRUPTED;
+
+ agino = irec.ir_startino;
+
+ /* Record a non-sparse inode chunk. */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ aglen = max_t(xfs_extlen_t, 1,
+ XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
+
+ return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ }
+
+ /* Iterate each chunk. */
+ iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+ XFS_INODES_PER_HOLEMASK_BIT);
+ aglen = iperhole / mp->m_sb.sb_inopblock;
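+ /*
+ * Walk the chunk in strides of one holemask bit or one full block of
+ * inodes, whichever covers more, so that each extent we emit is a
+ * whole number of blocks.
+ */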
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INOBT_HOLEMASK_BITS;
+ i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+ /* Skip holes. */
+ if (irec.ir_holemask & (1 << i))
+ continue;
+
+ /* Record the inode chunk otherwise. */
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
+STATIC int
+xrep_rmap_find_inode_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_inodes ri = {
+ .rr = rr,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ xagb_bitmap_init(&ri.inobt_blocks);
+ xagb_bitmap_init(&ri.ichunk_blocks);
+
+ /*
+ * Iterate every record in the inobt so we can capture all the inode
+ * chunks and the blocks in the inobt itself.
+ */
+ error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Note that if there are zero records in the inobt then query_all does
+ * nothing and we have to account the empty inobt root manually.
+ */
+ if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+
+ error = xagb_bitmap_set(&ri.inobt_blocks,
+ be32_to_cpu(agi->agi_root), 1);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Scan the finobt too. */
+ if (xfs_has_finobt(sc->mp)) {
+ error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
+ sc->sa.fino_cur);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
+ &XFS_RMAP_OINFO_INOBT);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
+ &XFS_RMAP_OINFO_INODES);
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri.inobt_blocks);
+ xagb_bitmap_destroy(&ri.ichunk_blocks);
+ return error;
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rmap_walk_cowblocks(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+
+ if (!xfs_refcount_check_domain(irec) ||
+ irec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rmap_find_refcount_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xagb_bitmap refcountbt_blocks; /* REFCBIT */
+ struct xagb_bitmap cow_blocks; /* COWBIT */
+ struct xfs_refcount_irec low = {
+ .rc_startblock = 0,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_refcount_irec high = {
+ .rc_startblock = -1U,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ if (!xfs_has_reflink(sc->mp))
+ return 0;
+
+ xagb_bitmap_init(&refcountbt_blocks);
+ xagb_bitmap_init(&cow_blocks);
+
+ /* refcountbt */
+ error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
+ if (error)
+ goto out_bitmap;
+
+ /* Collect rmaps for CoW staging extents. */
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
+ xrep_rmap_walk_cowblocks, &cow_blocks);
+ if (error)
+ goto out_bitmap;
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC);
+
+out_bitmap:
+ xagb_bitmap_destroy(&cow_blocks);
+ xagb_bitmap_destroy(&refcountbt_blocks);
+ return error;
+}
+
+/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
+STATIC int
+xrep_rmap_find_agheader_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ /* Create one rmap covering everything from the superblock to the AGFL. */
+ return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
+ XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
+ XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/* Generate rmaps for the log, if it's in this AG. */
+STATIC int
+xrep_rmap_find_log_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
+ return 0;
+
+ return xrep_rmap_stash(rr,
+ XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
+ sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
+}
+
+/* Check and count all the records that we gathered. */
+STATIC int
+xrep_rmap_check_record(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+ int error;
+
+ error = xrep_rmap_check_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ rr->nr_records++;
+ return 0;
+}
+
+/*
+ * Generate all the reverse-mappings for this AG, a list of the old rmapbt
+ * blocks, and the new btreeblks count. Figure out if we have enough free
+ * space to reconstruct the rmap btree. The caller must clean up the lists
+ * if anything goes wrong. This implements section (I) above.
+ */
+STATIC int
+xrep_rmap_find_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xchk_ag *sa = &sc->sa;
+ struct xfs_inode *ip;
+ struct xfs_btree_cur *mcur;
+ int error;
+
+ /* Find all the per-AG metadata. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ error = xrep_rmap_find_inode_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_refcount_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_agheader_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_log_rmaps(rr);
+end_agscan:
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Unlock the AG header buffers and cancel the transaction to release
+ * the log grant space while we scan the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ sa->agf_bp = NULL;
+ sa->agi_bp = NULL;
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /* Scan every inode in the filesystem for rmap records. */
+ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+ error = xrep_rmap_scan_inode(rr, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rr->iscan);
+ if (error)
+ return error;
+
+ /*
+ * Switch out for a real transaction and lock the AG headers in
+ * preparation for building a new tree.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+ error = xchk_perag_drain_and_lock(sc);
+ if (error)
+ return error;
+
+ /*
+ * If a hook failed to update the in-memory btree, we lack the data to
+ * continue the repair.
+ */
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ /*
+ * Now that we have everything locked again, we need to count the
+ * number of rmap records stashed in the btree. This should reflect
+ * all actively-owned space in the filesystem. At the same time, check
+ * all our records before we start building a new btree, which requires
+ * a bnobt cursor.
+ */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ rr->nr_records = 0;
+ error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
+
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ xfs_btree_del_cursor(mcur, error);
+
+ return error;
+}
+
+/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
+
+struct xrep_rmap_agfl {
+ struct xagb_bitmap *bitmap;
+ xfs_agnumber_t agno;
+};
+
+/* Add an AGFL block to the rmap list. */
+STATIC int
+xrep_rmap_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xrep_rmap_agfl *ra = priv;
+
+ return xagb_bitmap_set(ra->bitmap, agbno, 1);
+}
+
+/*
+ * Run one round of reserving space for the new rmapbt and recomputing the
+ * number of blocks needed to store the previously observed rmapbt records and
+ * the ones we'll create for the free space metadata. When we don't need more
+ * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
+ * true.
+ */
+STATIC int
+xrep_rmap_try_reserve(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur,
+ struct xagb_bitmap *freesp_blocks,
+ uint64_t *blocks_reserved,
+ bool *done)
+{
+ struct xrep_rmap_agfl ra = {
+ .bitmap = freesp_blocks,
+ .agno = rr->sc->sa.pag->pag_agno,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ uint64_t nr_blocks; /* RMB */
+ uint64_t freesp_records;
+ int error;
+
+ /*
+ * We're going to recompute new_btree.bload.nr_blocks at the end of
+ * this function to reflect however many btree blocks we need to store
+ * all the rmap records (including the ones that reflect the changes we
+ * made to support the new rmapbt blocks), so we save the old value
+ * here so we can decide if we've reserved enough blocks.
+ */
+ nr_blocks = rr->new_btree.bload.nr_blocks;
+
+ /*
+ * Make sure we've reserved enough space for the new btree. This can
+ * change the shape of the free space btrees, which can cause secondary
+ * interactions with the rmap records because all three space btrees
+ * have the same rmap owner. We'll account for all that below.
+ */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ nr_blocks - *blocks_reserved);
+ if (error)
+ return error;
+
+ *blocks_reserved = rr->new_btree.bload.nr_blocks;
+
+ /* Clear everything in the bitmap. */
+ xagb_bitmap_destroy(freesp_blocks);
+
+ /* Set all the bnobt blocks in the bitmap. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ return error;
+
+ /* Set all the cntbt blocks in the bitmap. */
+ sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
+ xfs_btree_del_cursor(sc->sa.cnt_cur, error);
+ sc->sa.cnt_cur = NULL;
+ if (error)
+ return error;
+
+ /*
+ * Record our new btreeblks value. Subtract 2 because btreeblks does
+ * not count the bnobt and cntbt root blocks.
+ */
+ rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
+
+ /* Set all the new rmapbt blocks in the bitmap. */
+ list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
+ error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
+ if (error)
+ return error;
+ }
+
+ /* Set all the AGFL blocks in the bitmap. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
+ if (error)
+ return error;
+
+ /* Count the extents in the bitmap. */
+ freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
+
+ /* Compute how many blocks we'll need for all the rmaps. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records + freesp_records);
+ if (error)
+ return error;
+
+ /* We're done when we don't need more blocks. */
+ *done = nr_blocks >= rr->new_btree.bload.nr_blocks;
+ return 0;
+}
+
+/*
+ * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
+ * the free space metadata. This implements section (II) above.
+ */
+STATIC int
+xrep_rmap_reserve_space(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur)
+{
+ struct xagb_bitmap freesp_blocks; /* AGBIT */
+ uint64_t blocks_reserved = 0;
+ bool done = false;
+ int error;
+
+ /* Compute how many blocks we'll need for the rmaps collected so far. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ xagb_bitmap_init(&freesp_blocks);
+
+ /*
+ * Iteratively reserve space for the new rmapbt and recompute the
+ * number of blocks needed to store the previously observed rmapbt
+ * records and the ones we'll create for the free space metadata.
+ * Finish when we don't need more blocks.
+ */
+ do {
+ error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
+ &blocks_reserved, &done);
+ if (error)
+ goto out_bitmap;
+ } while (!done);
+
+ /* Emit rmaps for everything in the free space bitmap. */
+ xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
+ error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
+ xchk_ag_btcur_free(&rr->sc->sa);
+
+out_bitmap:
+ xagb_bitmap_destroy(&freesp_blocks);
+ return error;
+}
+
+/* Section (III): Building the new rmap btree. */
+
+/* Update the AGF counters. */
+STATIC int
+xrep_rmap_reset_counters(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t rmap_btblocks;
+
+ /*
+ * The AGF header contains extra information related to the reverse
+ * mapping btree, so we must update those fields here.
+ */
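+ /* btreeblks does not count the rmapbt root block, hence the -1. */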
+ rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
+ agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the rmapbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/* Retrieve rmapbt data for bulk load. */
+STATIC int
+xrep_rmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
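+ /*
+ * The in-memory cursor was positioned at the left edge before the
+ * bulk load began, so each increment advances to the next record to
+ * copy into the new btree block.
+ */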
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ int stat = 0;
+
+ error = xfs_btree_increment(rr->mcur, 0, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Custom allocation function for new rmap btrees. */
+STATIC int
+xrep_rmap_alloc_vextent(
+ struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint)
+{
+ int error;
+
+ /*
+ * We don't want an rmap update on the allocation, since we iteratively
+ * compute the OWN_AG records /after/ allocating blocks for the records
+ * that we already know we need to store. Therefore, fix the freelist
+ * with the NORMAP flag set so that we don't also try to create an rmap
+ * for new AGFL blocks.
+ */
+ error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
+ if (error)
+ return error;
+
+ /*
+ * If xrep_fix_freelist fixed the freelist by moving blocks from the
+ * free space btrees or by removing blocks from the AGFL and queueing
+ * an EFI to free the block, the transaction will be dirty. This
+ * second case is of interest to us.
+ *
+ * Later on, we will need to compare gaps in the new recordset against
+ * the block usage of all OWN_AG owners in order to free the old
+ * btree's blocks, which means that we can't have EFIs for former AGFL
+ * blocks attached to the repair transaction when we commit the new
+ * btree.
+ *
+ * xrep_newbt_alloc_blocks guarantees this for us by calling
+ * xrep_defer_finish to commit anything that fix_freelist may have
+ * added to the transaction.
+ */
+ return xfs_alloc_vextent_near_bno(args, alloc_hint);
+}
+
+/* Count the records in this btree. */
+STATIC int
+xrep_rmap_count_records(
+ struct xfs_btree_cur *cur,
+ unsigned long long *nr)
+{
+ int running = 1;
+ int error;
+
+ *nr = 0;
+
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ return error;
+
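+ /* From the left edge, every successful increment lands on a record. */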
+ while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
+ if (running)
+ (*nr)++;
+ }
+
+ return error;
+}
+
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rmap_build_new_tree(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_btree_cur *rmap_cur;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ /*
+ * Preserve the old rmapbt block count so that we can adjust the
+ * per-AG rmapbt reservation after we commit the new btree root and
+ * want to dispose of the old btree blocks.
+ */
+ rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header. The new blocks are accounted to the
+ * rmapbt per-AG reservation, which we will adjust further after
+ * committing the new btree.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ fsbno, XFS_AG_RESV_RMAPBT);
+ rr->new_btree.bload.get_records = xrep_rmap_get_records;
+ rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
+ rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
+ rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
+
+ /*
+ * Initialize @rr->new_btree, reserve space for the new rmapbt,
+ * and compute OWN_AG rmaps.
+ */
+ error = xrep_rmap_reserve_space(rr, rmap_cur);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Count the rmapbt records again, because the space reservation
+ * for the rmapbt itself probably added more records to the btree.
+ */
+ rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
+ &rr->rmap_btree);
+
+ error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
+ if (error)
+ goto err_mcur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
+
+ /*
+ * Move the cursor to the left edge of the tree so that the first
+ * increment in ->get_records positions us at the first record.
+ */
+ error = xfs_btree_goto_left_edge(rr->mcur);
+ if (error)
+ goto err_level;
+
+ /* Add all observed rmap records. */
+ error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(rmap_cur, 0);
+ xfs_btree_del_cursor(rr->mcur, 0);
+ rr->mcur = NULL;
+
+ /*
+ * Now that we've written the new btree to disk, we don't need to keep
+ * updating the in-memory btree. Abort the scan to stop live updates.
+ */
+ xchk_iscan_abort(&rr->iscan);
+
+ /*
+ * The newly committed rmap recordset includes mappings for the blocks
+ * that we reserved to build the new btree. If there is excess space
+ * reservation to be freed, the corresponding rmap records must also be
+ * removed.
+ */
+ rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_rmap_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_rmap_level = 0;
+err_mcur:
+ xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+ xfs_btree_del_cursor(rmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Section (IV): Reaping the old btree. */
+
+struct xrep_rmap_find_gaps {
+ struct xagb_bitmap rmap_gaps;
+ xfs_agblock_t next_agbno;
+};
+
+/* Subtract each free extent in the bnobt from the rmap gaps. */
+STATIC int
+xrep_rmap_find_freesp(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xrep_rmap_find_gaps *rfg = priv;
+
+ return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
+ rec->ar_blockcount);
+}
+
+/* Record the gaps between reverse mappings, as part of cleaning out the btree. */
+STATIC int
+xrep_rmap_find_gaps(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_find_gaps *rfg = priv;
+ int error;
+
+ if (rec->rm_startblock > rfg->next_agbno) {
+ error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
+ rec->rm_startblock - rfg->next_agbno);
+ if (error)
+ return error;
+ }
+
+ rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/*
+ * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make
+ * a list of gaps in the rmap records and a list of the extents mentioned in
+ * the bnobt. Any block that's in the new rmapbt gap list but not mentioned
+ * in the bnobt is a block from the old rmapbt and can be removed.
+ */
+STATIC int
+xrep_rmap_remove_old_tree(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_find_gaps rfg = {
+ .next_agbno = 0,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_btree_cur *mcur;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&rfg.rmap_gaps);
+
+ /* Compute free space from the new rmapbt. */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+
+ error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_bitmap;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (rfg.next_agbno < agend) {
+ error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
+ agend - rfg.next_agbno);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Compute free space from the existing bnobt. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
+ &rfg);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Free the "free" blocks that the new rmapbt knows about but the bnobt
+ * doesn't--these are the old rmapbt blocks. Credit the old rmapbt
+ * block usage count back to the per-AG rmapbt reservation (and not
+ * fdblocks, since the rmap btree lives in free space) to keep the
+ * reservation and free space accounting correct.
+ */
+ error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
+ &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Now that we've zapped all the old rmapbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservation.
+ */
+ pag->pagf_repair_rmap_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+out_bitmap:
+ xagb_bitmap_destroy(&rfg.rmap_gaps);
+ return error;
+}
+
+static inline bool
+xrep_rmapbt_want_live_update(
+ struct xchk_iscan *iscan,
+ const struct xfs_owner_info *oi)
+{
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ /*
+ * Before unlocking the AG header to perform the inode scan, we
+ * recorded reverse mappings for all AG metadata except for the OWN_AG
+ * metadata. IOWs, the in-memory btree knows about the AG headers, the
+ * two inode btrees, the CoW staging extents, and the refcount btrees.
+ * For these types of metadata, we need to record the live updates in
+ * the in-memory rmap btree.
+ *
+ * However, we do not scan the free space btrees or the AGFL until we
+ * have re-locked the AGF and are ready to reserve space for the new
+ * rmap btree, so we do not want live updates for OWN_AG metadata.
+ */
+ if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+ return oi->oi_owner != XFS_RMAP_OWN_AG;
+
+ /* Ignore updates to files that the scanner hasn't visited yet. */
+ return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the AGF buffer and is generating
+ * the update, so we must be careful about which parts of the struct xrep_rmap
+ * that we change.
+ */
+static int
+xrep_rmapbt_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_rmap_update_params *p = data;
+ struct xrep_rmap *rr;
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *mcur;
+ struct xfs_trans *tp;
+ void *txcookie;
+ int error;
+
+ rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
+ mp = rr->sc->mp;
+
+ if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
+ goto out_unlock;
+
+ trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
+
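+ /*
+ * This hook runs in the context of the thread applying the rmap
+ * update, which may already own a transaction, so borrow a dummy
+ * empty transaction to commit the in-memory btree update.
+ */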
+ error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+ if (error)
+ goto out_abort;
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
+ error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+ p->blockcount, &p->oinfo, p->unwritten);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rmap_btree, tp);
+ if (error)
+ goto out_cancel;
+
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ mutex_unlock(&rr->lock);
+ return NOTIFY_DONE;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, tp);
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ mutex_unlock(&rr->lock);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+out_unlock:
+ return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rmap_setup_scan(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ mutex_init(&rr->lock);
+
+ /* Set up in-memory rmap btree */
+ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
+ sc->sa.pag->pag_agno);
+ if (error)
+ goto out_mutex;
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+ /*
+ * Hook into live rmap operations so that we can update our in-memory
+ * btree to reflect live changes on the filesystem. Since we drop the
+ * AGF buffer to scan all the inodes, we need this piece to avoid
+ * installing a stale btree.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+ xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
+ error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
+ if (error)
+ goto out_iscan;
+ return 0;
+
+out_iscan:
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+out_mutex:
+ mutex_destroy(&rr->lock);
+ return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rmap_teardown(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
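+ /* Abort the scan first so that any remaining live hooks stand down. */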
+ xchk_iscan_abort(&rr->iscan);
+ xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+ mutex_destroy(&rr->lock);
+}
+
+/* Repair the rmap btree for some AG. */
+int
+xrep_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr = sc->buf;
+ int error;
+
+ error = xrep_rmap_setup_scan(rr);
+ if (error)
+ return error;
+
+ /*
+ * Collect rmaps for everything in this AG that isn't space metadata.
+ * These rmaps won't change even as we try to allocate blocks.
+ */
+ error = xrep_rmap_find_rmaps(rr);
+ if (error)
+ goto out_records;
+
+ /* Rebuild the rmap information. */
+ error = xrep_rmap_build_new_tree(rr);
+ if (error)
+ goto out_records;
+
+ /* Kill the old tree. */
+ error = xrep_rmap_remove_old_tree(rr);
+
+out_records:
+ xrep_rmap_teardown(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index b1ff4f33324a..5055092bd9e8 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -119,7 +119,7 @@ xfsum_load(
xfs_rtsumoff_t sumoff,
union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, rawinfo,
+ return xfile_load(sc->xfile, rawinfo,
sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
@@ -130,7 +130,7 @@ xfsum_store(
xfs_rtsumoff_t sumoff,
const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &rawinfo,
+ return xfile_store(sc->xfile, &rawinfo,
sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
@@ -142,7 +142,7 @@ xfsum_copyout(
union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
+ return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index caf324c2b991..20fac9723c08 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -15,6 +15,8 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_scrub.h"
+#include "xfs_buf_mem.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -157,6 +159,15 @@ xchk_fsgates_disable(
if (sc->flags & XCHK_FSGATES_DRAIN)
xfs_drain_wait_disable();
+ if (sc->flags & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_disable();
+
sc->flags &= ~XCHK_FSGATES_ALL;
}
@@ -184,6 +195,10 @@ xchk_teardown(
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
+ if (sc->xmbtp) {
+ xmbuf_free(sc->xmbtp);
+ sc->xmbtp = NULL;
+ }
if (sc->xfile) {
xfile_destroy(sc->xfile);
sc->xfile = NULL;
@@ -267,7 +282,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.setup = xchk_setup_ag_rmapbt,
.scrub = xchk_rmapbt,
.has = xfs_has_rmapbt,
- .repair = xrep_notsupported,
+ .repair = xrep_rmapbt,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
@@ -358,7 +373,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_fscounters,
.scrub = xchk_fscounters,
- .repair = xrep_notsupported,
+ .repair = xrep_fscounters,
+ },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
+ .type = ST_FS,
+ .setup = xchk_setup_quotacheck,
+ .scrub = xchk_quotacheck,
+ .repair = xrep_quotacheck,
+ },
+ [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */
+ .type = ST_FS,
+ .setup = xchk_setup_nlinks,
+ .scrub = xchk_nlinks,
+ .repair = xrep_nlinks,
+ },
+ [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */
+ .type = ST_FS,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_health_record,
+ .repair = xrep_notsupported,
},
};
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 7fc50654c4fe..9ad65b604fe1 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -99,6 +99,9 @@ struct xfs_scrub {
/* xfile used by the scrubbers; freed at teardown. */
struct xfile *xfile;
+ /* buffer target for in-memory btrees; also freed at teardown. */
+ struct xfs_buftarg *xmbtp;
+
/* Lock flags for @ip. */
uint ilock_flags;
@@ -121,6 +124,9 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */
+#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */
+#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */
#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
@@ -130,7 +136,10 @@ struct xfs_scrub {
* features are gated off via dynamic code patching, which is why the state
* must be enabled during scrub setup and can only be torn down afterwards.
*/
-#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN)
+#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \
+ XCHK_FSGATES_QUOTA | \
+ XCHK_FSGATES_DIRENTS | \
+ XCHK_FSGATES_RMAP)
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
@@ -167,14 +176,21 @@ xchk_rtsummary(struct xfs_scrub *sc)
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
+int xchk_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
+static inline int
+xchk_quotacheck(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
#endif
int xchk_fscounters(struct xfs_scrub *sc);
+int xchk_nlinks(struct xfs_scrub *sc);
/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index cd91db4a5548..42cafbed94ac 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -77,6 +77,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
[XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
[XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+ [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
+ [XFS_SCRUB_TYPE_NLINKS] = "nlinks",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
@@ -329,9 +331,9 @@ xchk_stats_register(
if (!cs->cs_debugfs)
return;
- debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ debugfs_create_file("stats", 0444, cs->cs_debugfs, cs,
&scrub_stats_fops);
- debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs,
&clear_scrub_stats_fops);
}
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index ddff86713df3..d77d8a9598f6 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_symlink.h"
#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/health.h"
@@ -67,7 +68,7 @@ xchk_symlink(
}
/* Remote symlink; must read the contents. */
- error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ error = xfs_symlink_remote_read(sc->ip, sc->buf);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index d0e24ffaf754..3dd281d6d185 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -16,10 +16,16 @@
#include "xfs_rtbitmap.h"
#include "xfs_quota.h"
#include "xfs_quota_defs.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/quota.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/fscounters.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
@@ -32,7 +38,7 @@ xchk_btree_cur_fsbno(
xfs_buf_daddr(cur->bc_levels[level].bp));
if (level == cur->bc_nlevels - 1 &&
- (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
return NULLFSBLOCK;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 6bbb4e8639dc..5b294be52c55 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -15,11 +15,17 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+#include "xfs_quota_defs.h"
+struct xfs_scrub;
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
struct xchk_dqiter;
+struct xchk_iscan;
+struct xchk_nlink;
+struct xchk_fscounters;
+struct xfs_rmap_update_params;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -27,14 +33,6 @@ struct xchk_dqiter;
* ring buffer. Somehow this was only worth mentioning in the ftrace sample
* code.
*/
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
@@ -63,6 +61,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -89,7 +90,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
{ XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
{ XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
- { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \
+ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
+ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
+ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -107,9 +111,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
+ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \
+ { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \
+ { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \
{ XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -395,6 +411,29 @@ DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+
+TRACE_EVENT(xchk_qcheck_error,
+ TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id,
+ void *ret_ip),
+ TP_ARGS(sc, dqtype, id, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_dqid_t, id)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->dqtype = dqtype;
+ __entry->id = id;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->id,
+ __entry->ret_ip)
+);
#endif /* CONFIG_XFS_QUOTA */
TRACE_EVENT(xchk_incomplete,
@@ -423,7 +462,7 @@ TRACE_EVENT(xchk_btree_op_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -436,7 +475,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -444,10 +483,10 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -465,7 +504,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, ptr)
__field(xfs_agnumber_t, agno)
@@ -479,7 +518,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -487,12 +526,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -508,7 +547,7 @@ TRACE_EVENT(xchk_btree_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -519,17 +558,17 @@ TRACE_EVENT(xchk_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -546,7 +585,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -559,19 +598,19 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -586,7 +625,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
__field(int, level)
@@ -598,17 +637,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
),
- TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->bno,
__entry->level,
@@ -861,18 +900,11 @@ TRACE_EVENT(xfile_destroy,
__field(loff_t, size)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes = inode->i_blocks << SECTOR_SHIFT;
+ __entry->size = i_size_read(inode);
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
__entry->ino,
@@ -891,19 +923,12 @@ DECLARE_EVENT_CLASS(xfile_class,
__field(unsigned long long, bytecount)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes_used = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes_used = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT;
__entry->pos = pos;
+ __entry->size = i_size_read(inode);
__entry->bytecount = bytecount;
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
@@ -917,11 +942,11 @@ DECLARE_EVENT_CLASS(xfile_class,
DEFINE_EVENT(xfile_class, name, \
TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
TP_ARGS(xf, pos, bytecount))
-DEFINE_XFILE_EVENT(xfile_pread);
-DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_load);
+DEFINE_XFILE_EVENT(xfile_store);
DEFINE_XFILE_EVENT(xfile_seek_data);
-DEFINE_XFILE_EVENT(xfile_get_page);
-DEFINE_XFILE_EVENT(xfile_put_page);
+DEFINE_XFILE_EVENT(xfile_get_folio);
+DEFINE_XFILE_EVENT(xfile_put_folio);
TRACE_EVENT(xfarray_create,
TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -968,7 +993,7 @@ TRACE_EVENT(xfarray_isort,
__entry->hi - __entry->lo)
);
-TRACE_EVENT(xfarray_pagesort,
+TRACE_EVENT(xfarray_foliosort,
TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
TP_ARGS(si, lo, hi),
TP_STRUCT__entry(
@@ -1039,6 +1064,47 @@ TRACE_EVENT(xfarray_sort,
__entry->bytes)
);
+TRACE_EVENT(xfarray_sort_scan,
+ TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
+ TP_ARGS(si, idx),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(unsigned long long, idx)
+ __field(unsigned long long, folio_pos)
+ __field(unsigned long, folio_bytes)
+ __field(unsigned long long, first_idx)
+ __field(unsigned long long, last_idx)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->idx = idx;
+ if (si->folio) {
+ __entry->folio_pos = folio_pos(si->folio);
+ __entry->folio_bytes = folio_size(si->folio);
+ __entry->first_idx = si->first_folio_idx;
+ __entry->last_idx = si->last_folio_idx;
+ } else {
+ __entry->folio_pos = 0;
+ __entry->folio_bytes = 0;
+ __entry->first_idx = 0;
+ __entry->last_idx = 0;
+ }
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->idx,
+ __entry->folio_pos,
+ __entry->folio_bytes,
+ __entry->first_idx,
+ __entry->last_idx)
+);
+
TRACE_EVENT(xfarray_sort_stats,
TP_PROTO(struct xfarray_sortinfo *si, int error),
TP_ARGS(si, error),
@@ -1119,6 +1185,323 @@ TRACE_EVENT(xchk_rtsum_record_free,
);
#endif /* CONFIG_XFS_RT */
+DECLARE_EVENT_CLASS(xchk_iscan_class,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited)
+)
+#define DEFINE_ISCAN_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
+DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
+DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
+
+DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino),
+ TP_ARGS(iscan, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, startino)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->startino = iscan->scan_start_ino;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->startino,
+ __entry->cursor,
+ __entry->visited,
+ __entry->ino)
+)
+#define DEFINE_ISCAN_INO_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_ino_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \
+ TP_ARGS(iscan, ino))
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update);
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_start);
+
+TRACE_EVENT(xchk_iscan_iget,
+ TP_PROTO(struct xchk_iscan *iscan, int error),
+ TP_ARGS(iscan, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->error)
+);
+
+TRACE_EVENT(xchk_iscan_iget_batch,
+ TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
+ unsigned int nr, unsigned int avail),
+ TP_ARGS(mp, iscan, nr, avail),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, nr)
+ __field(unsigned int, avail)
+ __field(unsigned int, unavail)
+ __field(xfs_ino_t, batch_ino)
+ __field(unsigned long long, skipmask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->nr = nr;
+ __entry->avail = avail;
+ __entry->unavail = hweight64(iscan->__skipped_inomask);
+ __entry->batch_ino = iscan->__batch_ino;
+ __entry->skipmask = iscan->__skipped_inomask;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->batch_ino,
+ __entry->skipmask,
+ __entry->nr,
+ __entry->avail,
+ __entry->unavail)
+);
+
+TRACE_EVENT(xchk_iscan_iget_retry_wait,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, retry_delay)
+ __field(unsigned long, remaining)
+ __field(unsigned int, iget_timeout)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->retry_delay = iscan->iget_retry_delay;
+ __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies);
+ __entry->iget_timeout = iscan->iget_timeout;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->remaining,
+ __entry->iget_timeout,
+ __entry->retry_delay)
+);
+
+TRACE_EVENT(xchk_nlinks_collect_dirent,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ xfs_ino_t ino, const struct xfs_name *name),
+ TP_ARGS(mp, dp, ino, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_collect_metafile,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
+ TP_ARGS(mp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+TRACE_EVENT(xchk_nlinks_live_update,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp,
+ int action, xfs_ino_t ino, int delta,
+ const char *name, unsigned int namelen),
+ TP_ARGS(mp, dp, action, ino, delta, name, namelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(int, action)
+ __field(xfs_ino_t, ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp ? dp->i_ino : NULLFSINO;
+ __entry->action = action;
+ __entry->ino = ino;
+ __entry->delta = delta;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_check_zero,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ino, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+TRACE_EVENT(xchk_nlinks_update_incore,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live, int parents_delta,
+ int backrefs_delta, int children_delta),
+ TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ __field(int, parents_delta)
+ __field(int, backrefs_delta)
+ __field(int, children_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ __entry->parents_delta = parents_delta;
+ __entry->backrefs_delta = backrefs_delta;
+ __entry->children_delta = children_delta;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents_delta,
+ __entry->parents,
+ __entry->backrefs_delta,
+ __entry->backrefs,
+ __entry->children_delta,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xchk_nlinks_diff_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ip, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ __field(xfs_nlink_t, nlink)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->nlink,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \
+DEFINE_EVENT(xchk_nlinks_diff_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \
+ const struct xchk_nlink *live), \
+ TP_ARGS(mp, ip, live))
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -1223,7 +1606,6 @@ DEFINE_EVENT(xrep_rmap_class, name, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
-DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
TRACE_EVENT(xrep_abt_found,
@@ -1341,6 +1723,38 @@ TRACE_EVENT(xrep_bmap_found,
__entry->state)
);
+TRACE_EVENT(xrep_rmap_found,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_rmap_irec *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
@@ -1425,16 +1839,28 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->refcbt_sz)
)
TRACE_EVENT(xrep_reset_counters,
- TP_PROTO(struct xfs_mount *mp),
- TP_ARGS(mp),
+ TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
+ TP_ARGS(mp, fsc),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(uint64_t, icount)
+ __field(uint64_t, ifree)
+ __field(uint64_t, fdblocks)
+ __field(uint64_t, frextents)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __entry->icount = fsc->icount;
+ __entry->ifree = fsc->ifree;
+ __entry->fdblocks = fsc->fdblocks;
+ __entry->frextents = fsc->frextents;
),
- TP_printk("dev %d:%d",
- MAJOR(__entry->dev), MINOR(__entry->dev))
+ TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount,
+ __entry->ifree,
+ __entry->fdblocks,
+ __entry->frextents)
)
DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
@@ -1645,6 +2071,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps,
__entry->attr_extents)
);
+TRACE_EVENT(xrep_dinode_findmode_dirent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype),
+ TP_ARGS(sc, dp, ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent_inval,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype, unsigned int found_ftype),
+ TP_ARGS(sc, dp, ftype, found_ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ __field(unsigned int, found_ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ __entry->found_ftype = found_ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR))
+);
+
TRACE_EVENT(xrep_cow_mark_file_range,
TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
xfs_fileoff_t startoff, xfs_filblks_t blockcount),
@@ -1756,8 +2231,48 @@ DEFINE_EVENT(xrep_dquot_class, name, \
DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot);
#endif /* CONFIG_XFS_QUOTA */
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
+
+TRACE_EVENT(xrep_rmap_live_update,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op,
+ const struct xfs_rmap_update_params *p),
+ TP_ARGS(mp, agno, op, p),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, op)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->op = op;
+ __entry->agbno = p->startblock;
+ __entry->len = p->blockcount;
+ xfs_owner_info_unpack(&p->oinfo, &__entry->owner,
+ &__entry->offset, &__entry->flags);
+ if (p->unwritten)
+ __entry->flags |= XFS_RMAP_UNWRITTEN;
+ ),
+ TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->op,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
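
With xfs_btnum_t gone, every tracepoint above stores the btree name as a dynamic string taken from cur->bc_ops->name instead of symbolically decoding an enum. The __string/__assign_str/__get_str triple is the generic ftrace mechanism for this; a minimal hedged sketch (the demo event name is illustrative):

    TRACE_EVENT(demo_btree_visit,
        TP_PROTO(struct xfs_btree_cur *cur, int level),
        TP_ARGS(cur, level),
        TP_STRUCT__entry(
            __string(name, cur->bc_ops->name) /* dynamic string slot */
            __field(int, level)
        ),
        TP_fast_assign(
            __assign_str(name, cur->bc_ops->name);
            __entry->level = level;
        ),
        TP_printk("%sbt level %d", __get_str(name), __entry->level)
    );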
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index f0f532c10a5a..17c982a4821d 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -16,7 +16,7 @@
* Large Arrays of Fixed-Size Records
* ==================================
*
- * This memory array uses an xfile (which itself is a memfd "file") to store
+ * This memory array uses an xfile (which itself is a shmem file) to store
* large numbers of fixed-size records in memory that can be paged out. This
* puts less stress on the memory reclaim algorithms during an online repair
* because we don't have to pin so much memory. However, array access is less
@@ -136,7 +136,7 @@ xfarray_load(
if (idx >= array->nr)
return -ENODATA;
- return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ return xfile_load(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
}
@@ -152,7 +152,7 @@ xfarray_is_unset(
if (array->unset_slots == 0)
return false;
- error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ error = xfile_load(array->xfile, temp, array->obj_size, pos);
if (!error && xfarray_element_is_null(array, temp))
return true;
@@ -184,7 +184,7 @@ xfarray_unset(
return 0;
memset(temp, 0, array->obj_size);
- error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ error = xfile_store(array->xfile, temp, array->obj_size, pos);
if (error)
return error;
@@ -209,7 +209,7 @@ xfarray_store(
ASSERT(!xfarray_element_is_null(array, ptr));
- ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ ret = xfile_store(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
if (ret)
return ret;
@@ -245,12 +245,12 @@ xfarray_store_anywhere(
for (pos = 0;
pos < endpos && array->unset_slots > 0;
pos += array->obj_size) {
- error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ error = xfile_load(array->xfile, temp, array->obj_size,
pos);
if (error || !xfarray_element_is_null(array, temp))
continue;
- error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ error = xfile_store(array->xfile, ptr, array->obj_size,
pos);
if (error)
return error;
@@ -552,7 +552,7 @@ xfarray_isort(
trace_xfarray_isort(si, lo, hi);
xfarray_sort_bump_loads(si);
- error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ error = xfile_load(si->array->xfile, scratch, len, lo_pos);
if (error)
return error;
@@ -560,88 +560,45 @@ xfarray_isort(
sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+ return xfile_store(si->array->xfile, scratch, len, lo_pos);
}
-/* Grab a page for sorting records. */
-static inline int
-xfarray_sort_get_page(
- struct xfarray_sortinfo *si,
- loff_t pos,
- uint64_t len)
-{
- int error;
-
- error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
- if (error)
- return error;
-
- /*
- * xfile pages must never be mapped into userspace, so we skip the
- * dcache flush when mapping the page.
- */
- si->page_kaddr = kmap_local_page(si->xfpage.page);
- return 0;
-}
-
-/* Release a page we grabbed for sorting records. */
-static inline int
-xfarray_sort_put_page(
- struct xfarray_sortinfo *si)
-{
- if (!si->page_kaddr)
- return 0;
-
- kunmap_local(si->page_kaddr);
- si->page_kaddr = NULL;
-
- return xfile_put_page(si->array->xfile, &si->xfpage);
-}
-
-/* Decide if these records are eligible for in-page sorting. */
-static inline bool
-xfarray_want_pagesort(
- struct xfarray_sortinfo *si,
- xfarray_idx_t lo,
- xfarray_idx_t hi)
-{
- pgoff_t lo_page;
- pgoff_t hi_page;
- loff_t end_pos;
-
- /* We can only map one page at a time. */
- lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
- end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
- hi_page = end_pos >> PAGE_SHIFT;
-
- return lo_page == hi_page;
-}
-
-/* Sort a bunch of records that all live in the same memory page. */
+/*
+ * Sort the records from lo to hi (inclusive) if they are all backed by the
+ * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative
+ * errno.
+ */
STATIC int
-xfarray_pagesort(
+xfarray_foliosort(
struct xfarray_sortinfo *si,
xfarray_idx_t lo,
xfarray_idx_t hi)
{
+ struct folio *folio;
void *startp;
loff_t lo_pos = xfarray_pos(si->array, lo);
- uint64_t len = xfarray_pos(si->array, hi - lo);
- int error = 0;
+ uint64_t len = xfarray_pos(si->array, hi - lo + 1);
- trace_xfarray_pagesort(si, lo, hi);
+ /* No single folio could back this many records. */
+ if (len > XFILE_MAX_FOLIO_SIZE)
+ return 0;
xfarray_sort_bump_loads(si);
- error = xfarray_sort_get_page(si, lo_pos, len);
- if (error)
- return error;
+ folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ if (!folio)
+ return 0;
+
+ trace_xfarray_foliosort(si, lo, hi);
xfarray_sort_bump_heapsorts(si);
- startp = si->page_kaddr + offset_in_page(lo_pos);
+ startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfarray_sort_put_page(si);
+ xfile_put_folio(si->array->xfile, folio);
+ return 1;
}
/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
@@ -829,63 +786,78 @@ xfarray_qsort_push(
return 0;
}
+static inline void
+xfarray_sort_scan_done(
+ struct xfarray_sortinfo *si)
+{
+ if (si->folio)
+ xfile_put_folio(si->array->xfile, si->folio);
+ si->folio = NULL;
+}
+
/*
- * Load an element from the array into the first scratchpad and cache the page,
- * if possible.
+ * Cache the folio backing the start of the given array element. If the
+ * array element is contained entirely within that folio, return a pointer
+ * to the element inside the cached folio. Otherwise, load the element into
+ * the scratchpad and return a pointer to the scratchpad.
*/
static inline int
-xfarray_sort_load_cached(
+xfarray_sort_scan(
struct xfarray_sortinfo *si,
xfarray_idx_t idx,
- void *ptr)
+ void **ptrp)
{
loff_t idx_pos = xfarray_pos(si->array, idx);
- pgoff_t startpage;
- pgoff_t endpage;
int error = 0;
- /*
- * If this load would split a page, release the cached page, if any,
- * and perform a traditional read.
- */
- startpage = idx_pos >> PAGE_SHIFT;
- endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
- if (startpage != endpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ if (xfarray_sort_terminated(si, &error))
+ return error;
- if (xfarray_sort_terminated(si, &error))
- return error;
+ trace_xfarray_sort_scan(si, idx);
- return xfile_obj_load(si->array->xfile, ptr,
- si->array->obj_size, idx_pos);
- }
+ /* If the cached folio doesn't cover this index, release it. */
+ if (si->folio &&
+ (idx < si->first_folio_idx || idx > si->last_folio_idx))
+ xfarray_sort_scan_done(si);
- /* If the cached page is not the one we want, release it. */
- if (xfile_page_cached(&si->xfpage) &&
- xfile_page_index(&si->xfpage) != startpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ /* Grab the first folio that backs this array element. */
+ if (!si->folio) {
+ loff_t next_pos;
+
+ si->folio = xfile_get_folio(si->array->xfile, idx_pos,
+ si->array->obj_size, XFILE_ALLOC);
+ if (IS_ERR(si->folio))
+ return PTR_ERR(si->folio);
+
+ si->first_folio_idx = xfarray_idx(si->array,
+ folio_pos(si->folio) + si->array->obj_size - 1);
+
+ next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
+ if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
+ si->last_folio_idx--;
+
+ trace_xfarray_sort_scan(si, idx);
}
/*
- * If we don't have a cached page (and we know the load is contained
- * in a single page) then grab it.
+	 * If this folio still doesn't cover the desired element, the element
+	 * must cross a folio boundary. Read into the scratchpad and we're done.
*/
- if (!xfile_page_cached(&si->xfpage)) {
- if (xfarray_sort_terminated(si, &error))
- return error;
+ if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
+ void *temp = xfarray_scratch(si->array);
- error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
- PAGE_SIZE);
+ error = xfile_load(si->array->xfile, temp, si->array->obj_size,
+ idx_pos);
if (error)
return error;
+
+ *ptrp = temp;
+ return 0;
}
- memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
- si->array->obj_size);
+ /* Otherwise return a pointer to the array element in the folio. */
+ *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
return 0;
}
@@ -952,6 +924,8 @@ xfarray_sort(
pivot = xfarray_sortinfo_pivot(si);
while (si->stack_depth >= 0) {
+ int ret;
+
lo = si_lo[si->stack_depth];
hi = si_hi[si->stack_depth];
@@ -964,13 +938,13 @@ xfarray_sort(
}
/*
- * If directly mapping the page and sorting can solve our
+ * If directly mapping the folio and sorting can solve our
* problems, we're done.
*/
- if (xfarray_want_pagesort(si, lo, hi)) {
- error = xfarray_pagesort(si, lo, hi);
- if (error)
- goto out_free;
+ ret = xfarray_foliosort(si, lo, hi);
+ if (ret < 0)
+ goto out_free;
+ if (ret == 1) {
si->stack_depth--;
continue;
}
@@ -995,25 +969,24 @@ xfarray_sort(
* than the pivot is on the right side of the range.
*/
while (lo < hi) {
+ void *p;
+
/*
* Decrement hi until it finds an a[hi] less than the
* pivot value.
*/
- error = xfarray_sort_load_cached(si, hi, scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
hi--;
- error = xfarray_sort_load_cached(si, hi,
- scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
@@ -1028,21 +1001,18 @@ xfarray_sort(
* Increment lo until it finds an a[lo] greater than
* the pivot value.
*/
- error = xfarray_sort_load_cached(si, lo, scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
lo++;
- error = xfarray_sort_load_cached(si, lo,
- scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
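
xfarray_foliosort() folds the old eligibility test and the sort into one call by returning a tri-state: a negative errno, 0 when the records span folios, or 1 when the subrange was sorted in place. The quicksort loop consumes it accordingly; a condensed sketch of the caller side:

    int ret = xfarray_foliosort(si, lo, hi);

    if (ret < 0)
        goto out_free;      /* propagate the errno */
    if (ret == 1) {
        si->stack_depth--;  /* subrange fully sorted within one folio */
        continue;
    }
    /* ret == 0: records cross a folio, fall back to partitioning */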
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 62b9c506fdd1..acb2f94c56c1 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+/*
+ * Load an array element, but zero the buffer if there's no data because we
+ * haven't stored to that array element yet.
+ */
+static inline int
+xfarray_load_sparse(
+ struct xfarray *array,
+ uint64_t idx,
+ void *rec)
+{
+ int error = xfarray_load(array, idx, rec);
+
+ if (error == -ENODATA) {
+ memset(rec, 0, array->obj_size);
+ return 0;
+ }
+ return error;
+}
+
/* Append an element to the array. */
static inline int xfarray_append(struct xfarray *array, const void *ptr)
{
@@ -105,9 +124,14 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
- /* Cache a page here for faster access. */
- struct xfile_page xfpage;
- void *page_kaddr;
+ /* Cache a folio here for faster scanning for pivots */
+ struct folio *folio;
+
+ /* First array index in folio that is completely readable */
+ xfarray_idx_t first_folio_idx;
+
+ /* Last array index in folio that is completely readable */
+ xfarray_idx_t last_folio_idx;
#ifdef DEBUG
/* Performance statistics. */
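
xfarray_load_sparse() is the counting-friendly variant: a slot that has never been stored reads back as zeroes rather than -ENODATA, so callers can blindly load-modify-store. A hedged usage sketch with a hypothetical record type:

    struct demo_nlink {     /* hypothetical record */
        u32 parents;
        u32 children;
    };

    static int demo_bump_parent(struct xfarray *array, uint64_t idx)
    {
        struct demo_nlink rec;
        int error;

        error = xfarray_load_sparse(array, idx, &rec); /* zeroed if unset */
        if (error)
            return error;
        rec.parents++;
        return xfarray_store(array, idx, &rec);
    }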
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 090c3ead43fd..8cdd863db585 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -34,13 +34,6 @@
* xfiles assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
- *
- * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
- * of a hole cause a page to be mapped into the file. If you are going to
- * create a sparse xfile, please be careful about reading from uninitialized
- * parts of the file. These pages are !Uptodate and will eventually be
- * reclaimed if not written, but in the short term this boosts memory
- * consumption.
*/
/*
@@ -62,38 +55,27 @@ xfile_create(
{
struct inode *inode;
struct xfile *xf;
- int error = -ENOMEM;
+ int error;
xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
if (!xf)
return -ENOMEM;
- xf->file = shmem_file_setup(description, isize, 0);
- if (!xf->file)
- goto out_xfile;
+ xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
if (IS_ERR(xf->file)) {
error = PTR_ERR(xf->file);
goto out_xfile;
}
- /*
- * We want a large sparse file that we can pread, pwrite, and seek.
- * xfile users are responsible for keeping the xfile hidden away from
- * all other callers, so we skip timestamp updates and security checks.
- * Make the inode only accessible by root, just in case the xfile ever
- * escapes.
- */
- xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
- FMODE_LSEEK;
- xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
inode = file_inode(xf->file);
- inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
- inode->i_mode &= ~0177;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
-
lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
trace_xfile_create(xf);
*xfilep = xf;
@@ -118,164 +100,128 @@ xfile_destroy(
}
/*
- * Read a memory object directly from the xfile's page cache. Unlike regular
- * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
- * high an offset, instead of truncating the read. Otherwise, we return
- * bytes read or an error code, like regular pread.
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pread(
+int
+xfile_load(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- struct page *page = NULL;
- ssize_t read = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pread(xf, pos, count);
+ trace_xfile_load(xf, pos, count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
+ unsigned int offset;
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * In-kernel reads of a shmem file cause it to allocate a page
- * if the mapping shows a hole. Therefore, if we hit ENOMEM
- * we can continue by zeroing the caller's buffer.
- */
- page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
- __GFP_NOWARN);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- if (error != -ENOMEM)
- break;
-
- memset(buf, 0, len);
- goto advance;
- }
-
- if (PageUptodate(page)) {
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ SGP_READ) < 0)
+ break;
+ if (!folio) {
/*
- * xfile pages must never be mapped into userspace, so
- * we skip the dcache flush.
+ * No data stored at this offset, just zero the output
+ * buffer until the next page boundary.
*/
- kaddr = kmap_local_page(page);
- p = kaddr + offset_in_page(pos);
- memcpy(buf, p, len);
- kunmap_local(kaddr);
- } else {
+ len = min_t(ssize_t, count,
+ PAGE_SIZE - offset_in_page(pos));
memset(buf, 0, len);
- }
- put_page(page);
+ } else {
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ break;
+ }
+
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(buf, folio_address(folio) + offset, len);
-advance:
+ folio_unlock(folio);
+ folio_put(folio);
+ }
count -= len;
pos += len;
buf += len;
- read += len;
}
memalloc_nofs_restore(pflags);
- if (read > 0)
- return read;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/*
- * Write a memory object directly to the xfile's page cache. Unlike regular
- * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
- * high an offset, instead of truncating the write. Otherwise, we return
- * bytes written or an error code, like regular pwrite.
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pwrite(
+int
+xfile_store(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- ssize_t written = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pwrite(xf, pos, count);
+ trace_xfile_store(xf, pos, count);
+
+ /*
+	 * Increase the file size first so that shmem_get_folio(..., SGP_CACHE)
+ * actually allocates a folio instead of erroring out.
+ */
+ if (pos + count > i_size_read(inode))
+ i_size_write(inode, pos + count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *fsdata = NULL;
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
- int ret;
-
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path.
- * shmem doesn't support fs freeze, but lockdep doesn't know
- * that and will trip over that.
- */
- error = aops->write_begin(NULL, mapping, pos, len, &page,
- &fsdata);
- if (error)
- break;
+ unsigned int offset;
- /*
- * xfile pages must never be mapped into userspace, so we skip
- * the dcache flush. If the page is not uptodate, zero it
- * before writing data.
- */
- kaddr = kmap_local_page(page);
- if (!PageUptodate(page)) {
- memset(kaddr, 0, PAGE_SIZE);
- SetPageUptodate(page);
- }
- p = kaddr + offset_in_page(pos);
- memcpy(p, buf, len);
- kunmap_local(kaddr);
-
- ret = aops->write_end(NULL, mapping, pos, len, len, page,
- fsdata);
- if (ret < 0) {
- error = ret;
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ SGP_CACHE) < 0)
+ break;
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- written += ret;
- if (ret != len)
- break;
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(folio_address(folio) + offset, buf, len);
+
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
- count -= ret;
- pos += ret;
- buf += ret;
+ count -= len;
+ pos += len;
+ buf += len;
}
memalloc_nofs_restore(pflags);
- if (written > 0)
- return written;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/* Find the next written area in the xfile data for a given offset. */
@@ -291,129 +237,76 @@ xfile_seek_data(
return ret;
}
-/* Query stat information for an xfile. */
-int
-xfile_stat(
- struct xfile *xf,
- struct xfile_stat *statbuf)
-{
- struct kstat ks;
- int error;
-
- error = vfs_getattr_nosec(&xf->file->f_path, &ks,
- STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
- if (error)
- return error;
-
- statbuf->size = ks.size;
- statbuf->bytes = ks.blocks << SECTOR_SHIFT;
- return 0;
-}
-
/*
- * Grab the (locked) page for a memory object. The object cannot span a page
- * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
- * cannot grab the page, or the usual negative errno.
+ * Grab the (locked) folio for a memory object. The object cannot span a folio
+ * boundary. Returns the locked folio if successful, NULL if there was no
+ * folio or it didn't cover the range requested, or an ERR_PTR on failure.
*/
-int
-xfile_get_page(
+struct folio *
+xfile_get_folio(
struct xfile *xf,
loff_t pos,
- unsigned int len,
- struct xfile_page *xfpage)
+ size_t len,
+ unsigned int flags)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- void *fsdata = NULL;
- loff_t key = round_down(pos, PAGE_SIZE);
+ struct folio *folio = NULL;
unsigned int pflags;
int error;
if (inode->i_sb->s_maxbytes - pos < len)
- return -ENOMEM;
- if (len > PAGE_SIZE - offset_in_page(pos))
- return -ENOTBLK;
-
- trace_xfile_get_page(xf, pos, len);
+ return ERR_PTR(-ENOMEM);
- pflags = memalloc_nofs_save();
+ trace_xfile_get_folio(xf, pos, len);
/*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path. shmem
- * doesn't support fs freeze, but lockdep doesn't know that and will
- * trip over that.
+	 * Increase the file size first so that shmem_get_folio(..., SGP_CACHE)
+ * actually allocates a folio instead of erroring out.
*/
- error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
- &fsdata);
+ if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode))
+ i_size_write(inode, pos + len);
+
+ pflags = memalloc_nofs_save();
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
+ memalloc_nofs_restore(pflags);
if (error)
- goto out_pflags;
+ return ERR_PTR(error);
- /* We got the page, so make sure we push out EOF. */
- if (i_size_read(inode) < pos + len)
- i_size_write(inode, pos + len);
+ if (!folio)
+ return NULL;
- /*
- * If the page isn't up to date, fill it with zeroes before we hand it
- * to the caller and make sure the backing store will hold on to them.
- */
- if (!PageUptodate(page)) {
- void *kaddr;
+ if (len > folio_size(folio) - offset_in_folio(folio, pos)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return NULL;
+ }
- kaddr = kmap_local_page(page);
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_local(kaddr);
- SetPageUptodate(page);
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-EIO);
}
/*
- * Mark each page dirty so that the contents are written to some
- * backing store when we drop this buffer, and take an extra reference
- * to prevent the xfile page from being swapped or removed from the
- * page cache by reclaim if the caller unlocks the page.
+ * Mark the folio dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xfile_put_folio.
*/
- set_page_dirty(page);
- get_page(page);
-
- xfpage->page = page;
- xfpage->fsdata = fsdata;
- xfpage->pos = key;
-out_pflags:
- memalloc_nofs_restore(pflags);
- return error;
+ if (flags & XFILE_ALLOC)
+ folio_set_dirty(folio);
+ return folio;
}
/*
- * Release the (locked) page for a memory object. Returns 0 or a negative
- * errno.
+ * Release the (locked) folio for a memory object.
*/
-int
-xfile_put_page(
+void
+xfile_put_folio(
struct xfile *xf,
- struct xfile_page *xfpage)
+ struct folio *folio)
{
- struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- unsigned int pflags;
- int ret;
-
- trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
-
- /* Give back the reference that we took in xfile_get_page. */
- put_page(xfpage->page);
+ trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio));
- pflags = memalloc_nofs_save();
- ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
- xfpage->page, xfpage->fsdata);
- memalloc_nofs_restore(pflags);
- memset(xfpage, 0, sizeof(struct xfile_page));
-
- if (ret < 0)
- return ret;
- if (ret != PAGE_SIZE)
- return -EIO;
- return 0;
+ folio_unlock(folio);
+ folio_put(folio);
}
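
The rewritten xfile_load/xfile_store pair behaves like pageable memory: any failure or short I/O surfaces as -ENOMEM, and loads from never-written ranges simply zero the caller's buffer (the SGP_READ path returns no folio for holes). A hedged round-trip sketch, assuming a kernel context where these calls are legal:

    static int demo_xfile_roundtrip(void)
    {
        struct xfile *xf;
        u64 in = 42, out = 0;
        int error;

        error = xfile_create("demo", 0, &xf);
        if (error)
            return error;

        error = xfile_store(xf, &in, sizeof(in), 64 * 1024);
        if (!error)
            error = xfile_load(xf, &out, sizeof(out), 64 * 1024);
        /* Loads from never-stored ranges zero the buffer, no error. */
        if (!error)
            error = xfile_load(xf, &out, sizeof(out), 0);

        xfile_destroy(xf);
        return error;
    }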
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index d56643b0f429..76d78dba7e34 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -6,22 +6,6 @@
#ifndef __XFS_SCRUB_XFILE_H__
#define __XFS_SCRUB_XFILE_H__
-struct xfile_page {
- struct page *page;
- void *fsdata;
- loff_t pos;
-};
-
-static inline bool xfile_page_cached(const struct xfile_page *xfpage)
-{
- return xfpage->page != NULL;
-}
-
-static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
-{
- return xfpage->page->index;
-}
-
struct xfile {
struct file *file;
};
@@ -29,49 +13,17 @@ struct xfile {
int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
void xfile_destroy(struct xfile *xf);
-ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
-ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
+int xfile_store(struct xfile *xf, const void *buf, size_t count,
loff_t pos);
-/*
- * Load an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pread(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Store an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pwrite(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
-struct xfile_stat {
- loff_t size;
- unsigned long long bytes;
-};
-
-int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER)
-int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
- struct xfile_page *xbuf);
-int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */
+struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
+ unsigned int flags);
+void xfile_put_folio(struct xfile *xf, struct folio *folio);
#endif /* __XFS_SCRUB_XFILE_H__ */
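
xfile_get_folio() has three outcomes — an ERR_PTR on failure, NULL when no folio covers the range (or the object straddles a folio boundary), and a locked folio otherwise — so direct-mapping callers need a fallback path. A hedged sketch with a hypothetical helper name:

    static int demo_write_direct(struct xfile *xf, loff_t pos,
            const void *obj, size_t len)
    {
        struct folio *folio;

        folio = xfile_get_folio(xf, pos, len, XFILE_ALLOC);
        if (IS_ERR(folio))
            return PTR_ERR(folio);
        if (!folio) /* object crosses a folio boundary */
            return xfile_store(xf, obj, len, pos);

        memcpy(folio_address(folio) + offset_in_folio(folio, pos),
               obj, len);
        xfile_put_folio(xf, folio);
        return 0;
    }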
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6b840301817a..4bf69c9c088e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -167,7 +167,7 @@ xfs_get_acl(struct inode *inode, int type, bool rcu)
acl = ERR_PTR(error);
}
- kmem_free(args.value);
+ kvfree(args.value);
return acl;
}
@@ -204,7 +204,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
error = xfs_attr_change(&args);
- kmem_free(args.value);
+ kvfree(args.value);
/*
* If the attribute didn't exist to start with that's fine.
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 89c7a9f4f930..24fb12986a56 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,6 +23,7 @@
#include "xfs_quota.h"
#include "xfs_dir2.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Invalidate any incore buffers associated with this remote attribute value
@@ -147,6 +148,7 @@ xfs_attr3_node_inactive(
if (level > XFS_DA_NODE_MAXDEPTH) {
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -197,6 +199,7 @@ xfs_attr3_node_inactive(
default:
xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -286,6 +289,7 @@ xfs_attr3_root_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 9e02111bd890..9b4c61e1c22e 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -108,7 +108,7 @@ STATIC void
xfs_attri_item_free(
struct xfs_attri_log_item *attrip)
{
- kmem_free(attrip->attri_item.li_lv_shadow);
+ kvfree(attrip->attri_item.li_lv_shadow);
xfs_attri_log_nameval_put(attrip->attri_nameval);
kmem_cache_free(xfs_attri_cache, attrip);
}
@@ -226,7 +226,7 @@ xfs_attri_init(
{
struct xfs_attri_log_item *attrip;
- attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Grab an extra reference to the name/value buffer for this log item.
@@ -251,7 +251,7 @@ static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
{
- kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kvfree(attrdp->attrd_item.li_lv_shadow);
kmem_cache_free(xfs_attrd_cache, attrdp);
}
@@ -386,11 +386,16 @@ xfs_attr_free_item(
xfs_da_state_free(attr->xattri_da_state);
xfs_attri_log_nameval_put(attr->xattri_nameval);
if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
- kmem_free(attr);
+ kfree(attr);
else
kmem_cache_free(xfs_attr_intent_cache, attr);
}
+static inline struct xfs_attr_intent *attri_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_attr_intent, xattri_list);
+}
+
/* Process an attr. */
STATIC int
xfs_attr_finish_item(
@@ -399,11 +404,10 @@ xfs_attr_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_attr_intent *attr;
+ struct xfs_attr_intent *attr = attri_entry(item);
struct xfs_da_args *args;
int error;
- attr = container_of(item, struct xfs_attr_intent, xattri_list);
args = attr->xattri_da_args;
/* Reset trans after EAGAIN cycle since the transaction is new */
@@ -443,9 +447,8 @@ STATIC void
xfs_attr_cancel_item(
struct list_head *item)
{
- struct xfs_attr_intent *attr;
+ struct xfs_attr_intent *attr = attri_entry(item);
- attr = container_of(item, struct xfs_attr_intent, xattri_list);
xfs_attr_free_item(attr);
}
@@ -512,8 +515,8 @@ xfs_attri_recover_work(
if (error)
return ERR_PTR(error);
- attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
- sizeof(struct xfs_da_args), KM_NOFS);
+ attr = kzalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL);
args = (struct xfs_da_args *)(attr + 1);
attr->xattri_da_args = args;
@@ -666,7 +669,7 @@ xfs_attr_create_done(
attrip = ATTRI_ITEM(intent);
- attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
&xfs_attrd_item_ops);
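attri_entry() above is the first of several one-line wrappers this series adds to replace open-coded container_of() calls in the deferred-op handlers. The pattern, shown with illustrative names rather than anything from the patch:

    struct my_intent {
    	struct list_head	mi_list;
    	/* ... payload ... */
    };

    /* recover the containing intent from its embedded list_head */
    static inline struct my_intent *mi_entry(const struct list_head *e)
    {
    	return list_entry(e, struct my_intent, mi_list);
    }

Each handler then initializes its local pointer at declaration time instead of calling container_of() in the function body.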
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index e368ad671e26..a6819a642cc0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,6 +22,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_dir2.h"
+#include "xfs_health.h"
STATIC int
xfs_attr_shortform_compare(const void *a, const void *b)
@@ -82,8 +83,10 @@ xfs_attr_shortform_list(
for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
- sfe->namelen)))
+ sfe->namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
context->put_listent(context,
sfe->flags,
sfe->nameval,
@@ -109,7 +112,7 @@ xfs_attr_shortform_list(
* It didn't all fit, so we have to sort everything on hashval.
*/
sbsize = sf->count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
+ sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL);
/*
* Scan the attribute list for the rest of the entries, storing
@@ -124,7 +127,8 @@ xfs_attr_shortform_list(
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
sizeof(*sfe));
- kmem_free(sbuf);
+ kfree(sbuf);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -175,6 +179,7 @@ xfs_attr_shortform_list(
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sbp->name,
sbp->namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
goto out;
}
@@ -188,7 +193,7 @@ xfs_attr_shortform_list(
cursor->offset++;
}
out:
- kmem_free(sbuf);
+ kfree(sbuf);
return error;
}
@@ -262,8 +267,10 @@ xfs_attr_node_list_lookup(
return 0;
/* We can't point back to the root. */
- if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
+ if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) {
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
}
if (expected_level != 0)
@@ -275,6 +282,7 @@ xfs_attr_node_list_lookup(
out_corruptbuf:
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -304,6 +312,8 @@ xfs_attr_node_list(
if (cursor->blkno > 0) {
error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
if ((error != 0) && (error != -EFSCORRUPTED))
return error;
if (bp) {
@@ -464,8 +474,10 @@ xfs_attr3_leaf_list_int(
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(name, namelen)))
+ !xfs_attr_namecheck(name, namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
context->put_listent(context, entry->flags,
name, namelen, valuelen);
if (context->seen_enough)
@@ -504,7 +516,7 @@ xfs_attr_list_ilocked(
{
struct xfs_inode *dp = context->dp;
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
/*
* Decide on what work routines to call based on the inode size.
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 52fb8a148b7d..d27859a684aa 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,6 +25,7 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_ag.h"
+#include "xfs_trace.h"
struct kmem_cache *xfs_bui_cache;
struct kmem_cache *xfs_bud_cache;
@@ -40,7 +41,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_free(buip->bui_item.li_lv_shadow);
+ kvfree(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -201,7 +202,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_free(budp->bud_item.li_lv_shadow);
+ kvfree(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
@@ -221,6 +222,11 @@ static const struct xfs_item_ops xfs_bud_item_ops = {
.iop_intent = xfs_bud_item_intent,
};
+static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_bmap_intent, bi_list);
+}
+
/* Sort bmap intents by inode. */
static int
xfs_bmap_update_diff_items(
@@ -228,37 +234,12 @@ xfs_bmap_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_bmap_intent *ba;
- struct xfs_bmap_intent *bb;
+ struct xfs_bmap_intent *ba = bi_entry(a);
+ struct xfs_bmap_intent *bb = bi_entry(b);
- ba = container_of(a, struct xfs_bmap_intent, bi_list);
- bb = container_of(b, struct xfs_bmap_intent, bi_list);
return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
- struct xfs_map_extent *map,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- map->me_flags = 0;
- switch (type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- map->me_flags = type;
- break;
- default:
- ASSERT(0);
- }
- if (state == XFS_EXT_UNWRITTEN)
- map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
/* Log bmap updates in the intent item. */
STATIC void
xfs_bmap_update_log_item(
@@ -281,8 +262,21 @@ xfs_bmap_update_log_item(
map->me_startblock = bi->bi_bmap.br_startblock;
map->me_startoff = bi->bi_bmap.br_startoff;
map->me_len = bi->bi_bmap.br_blockcount;
- xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork,
- bi->bi_bmap.br_state);
+
+ switch (bi->bi_type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ map->me_flags = bi->bi_type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (bi->bi_whichfork == XFS_ATTR_FORK)
+ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ map->me_flags |= XFS_BMAP_EXTENT_REALTIME;
}
static struct xfs_log_item *
@@ -325,13 +319,16 @@ xfs_bmap_update_create_done(
}
/* Take a passive ref to the AG containing the space we're mapping. */
-void
+static inline void
xfs_bmap_update_get_group(
struct xfs_mount *mp,
struct xfs_bmap_intent *bi)
{
xfs_agnumber_t agno;
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ return;
+
agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
/*
@@ -344,14 +341,40 @@ xfs_bmap_update_get_group(
bi->bi_pag = xfs_perag_intent_get(mp, agno);
}
+/* Add this deferred BUI to the transaction. */
+void
+xfs_bmap_defer_add(
+ struct xfs_trans *tp,
+ struct xfs_bmap_intent *bi)
+{
+ trace_xfs_bmap_defer(bi);
+
+ xfs_bmap_update_get_group(tp->t_mountp, bi);
+ xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
+}
+
/* Release a passive AG ref after finishing mapping work. */
static inline void
xfs_bmap_update_put_group(
struct xfs_bmap_intent *bi)
{
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ return;
+
xfs_perag_intent_put(bi->bi_pag);
}
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bi = bi_entry(item);
+
+ xfs_bmap_update_put_group(bi);
+ kmem_cache_free(xfs_bmap_intent_cache, bi);
+}
+
/* Process a deferred bmap update. */
STATIC int
xfs_bmap_update_finish_item(
@@ -360,19 +383,16 @@ xfs_bmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_bmap_intent *bi;
+ struct xfs_bmap_intent *bi = bi_entry(item);
int error;
- bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
error = xfs_bmap_finish_one(tp, bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
- xfs_bmap_update_put_group(bi);
- kmem_cache_free(xfs_bmap_intent_cache, bi);
+ xfs_bmap_update_cancel_item(item);
return error;
}
@@ -384,19 +404,6 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred bmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_bmap_intent *bi;
-
- bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
- xfs_bmap_update_put_group(bi);
- kmem_cache_free(xfs_bmap_intent_cache, bi);
-}
-
/* Is this recovered BUI ok? */
static inline bool
xfs_bui_validate(
@@ -428,6 +435,9 @@ xfs_bui_validate(
if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
return false;
+ if (map->me_flags & XFS_BMAP_EXTENT_REALTIME)
+ return xfs_verify_rtbext(mp, map->me_startblock, map->me_len);
+
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
@@ -445,7 +455,8 @@ xfs_bui_recover_work(
if (error)
return ERR_PTR(error);
- bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi = kmem_cache_zalloc(xfs_bmap_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
@@ -502,6 +513,12 @@ xfs_bmap_recover_work(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ if (!!(map->me_flags & XFS_BMAP_EXTENT_REALTIME) !=
+ xfs_ifork_is_realtime(ip, work->bi_whichfork)) {
+ error = -EFSCORRUPTED;
+ goto err_cancel;
+ }
+
if (work->bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 3fafd3881a0b..6fee6a508343 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -68,4 +68,8 @@ struct xfs_bud_log_item {
extern struct kmem_cache *xfs_bui_cache;
extern struct kmem_cache *xfs_bud_cache;
+struct xfs_bmap_intent;
+
+void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+
#endif /* __XFS_BMAP_ITEM_H__ */
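xfs_bmap_defer_add() folds the group reference and the defer-list hook into one traced call, so queueing a deferred mapping shrinks from two steps to one. Roughly, based on the hunks above:

    /* before: callers open-coded the AG reference and the defer hook */
    xfs_bmap_update_get_group(tp->t_mountp, bi);
    xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);

    /* after: one entry point; realtime forks skip the AG reference */
    xfs_bmap_defer_add(tp, bi);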
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c2531c28905c..19e11d1da660 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -66,7 +66,7 @@ xfs_zero_extent(
return blkdev_issue_zeroout(target->bt_bdev,
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, 0);
+ GFP_KERNEL, 0);
}
/*
@@ -508,8 +508,8 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
- (VFS_I(ip)->i_state & I_FREEING));
+ if (!(VFS_I(ip)->i_state & I_FREEING))
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
if (!S_ISREG(VFS_I(ip)->i_mode))
@@ -965,8 +965,7 @@ xfs_collapse_file_space(
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
bool done = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
trace_xfs_collapse_file_space(ip);
@@ -1035,8 +1034,7 @@ xfs_insert_file_space(
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
bool done = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
trace_xfs_insert_file_space(ip);
@@ -1307,16 +1305,16 @@ xfs_swap_extent_rmap(
}
/* Remove the mapping from the donor file. */
- xfs_bmap_unmap_extent(tp, tip, &uirec);
+ xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);
/* Remove the mapping from the source file. */
- xfs_bmap_unmap_extent(tp, ip, &irec);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);
/* Map the donor file's blocks into the source file. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);
/* Map the source file's blocks into the donor file. */
- xfs_bmap_map_extent(tp, tip, &irec);
+ xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);
error = xfs_defer_finish(tpp);
tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 01b41fabbe3c..1a18c381127e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -21,6 +21,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
+#include "xfs_buf_mem.h"
struct kmem_cache *xfs_buf_cache;
@@ -60,6 +61,11 @@ xfs_buf_submit(
return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
}
+static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
+{
+ return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -189,8 +195,8 @@ xfs_buf_get_maps(
return 0;
}
- bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
- KM_NOFS);
+ bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!bp->b_maps)
return -ENOMEM;
return 0;
@@ -204,7 +210,7 @@ xfs_buf_free_maps(
struct xfs_buf *bp)
{
if (bp->b_maps != &bp->__b_map) {
- kmem_free(bp->b_maps);
+ kfree(bp->b_maps);
bp->b_maps = NULL;
}
}
@@ -222,7 +228,8 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
+ bp = kmem_cache_zalloc(xfs_buf_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -289,7 +296,7 @@ xfs_buf_free_pages(
mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
- kmem_free(bp->b_pages);
+ kfree(bp->b_pages);
bp->b_pages = NULL;
bp->b_flags &= ~_XBF_PAGES;
}
@@ -312,10 +319,12 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
- if (bp->b_flags & _XBF_PAGES)
+ if (xfs_buftarg_is_mem(bp->b_target))
+ xmbuf_unmap_page(bp);
+ else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
else if (bp->b_flags & _XBF_KMEM)
- kmem_free(bp->b_addr);
+ kfree(bp->b_addr);
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
@@ -325,21 +334,21 @@ xfs_buf_alloc_kmem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- xfs_km_flags_t kmflag_mask = KM_NOFS;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
size_t size = BBTOB(bp->b_length);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
- kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
- bp->b_addr = kmem_alloc(size, kmflag_mask);
+ bp->b_addr = kmalloc(size, gfp_mask);
if (!bp->b_addr)
return -ENOMEM;
if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
- kmem_free(bp->b_addr);
+ kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
@@ -356,13 +365,11 @@ xfs_buf_alloc_pages(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- gfp_t gfp_mask = __GFP_NOWARN;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
long filled = 0;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
- else
- gfp_mask |= GFP_NOFS;
/* Make sure that we have a page list */
bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
@@ -429,11 +436,18 @@ _xfs_buf_map_pages(
/*
* vm_map_ram() will allocate auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we are likely to be under
- * GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOFS to prevent
- * memory reclaim re-entering the filesystem here and
- * potentially deadlocking.
+ * pagetables) with GFP_KERNEL, yet we are often under a scoped nofs
+ * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
+ * from the same call site that can be run from both above and
+ * below memory reclaim causes lockdep false positives. Hence we
+ * always need to force this allocation to nofs context because
+ * we can't pass __GFP_NOLOCKDEP down to auxiliary structures to
+ * prevent false positive lockdep reports.
+ *
+ * XXX(dgc): I think dquot reclaim is the only place we can get
+ * to this function from memory reclaim context now. If we fix
+ * that like we've fixed inode reclaim to avoid writeback from
+ * reclaim, this nofs wrapping can go away.
*/
nofs_flag = memalloc_nofs_save();
do {
@@ -499,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = {
};
int
-xfs_buf_hash_init(
- struct xfs_perag *pag)
+xfs_buf_cache_init(
+ struct xfs_buf_cache *bch)
{
- spin_lock_init(&pag->pag_buf_lock);
- return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+ spin_lock_init(&bch->bc_lock);
+ return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
void
-xfs_buf_hash_destroy(
- struct xfs_perag *pag)
+xfs_buf_cache_destroy(
+ struct xfs_buf_cache *bch)
{
- rhashtable_destroy(&pag->pag_buf_hash);
+ rhashtable_destroy(&bch->bc_hash);
}
static int
@@ -573,7 +587,7 @@ xfs_buf_find_lock(
static inline int
xfs_buf_lookup(
- struct xfs_perag *pag,
+ struct xfs_buf_cache *bch,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -582,7 +596,7 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
- bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
rcu_read_unlock();
return -ENOENT;
@@ -607,6 +621,7 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
+ struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -622,31 +637,33 @@ xfs_buf_find_insert(
if (error)
goto out_drop_pag;
- /*
- * For buffers that fit entirely within a single page, first attempt to
- * allocate the memory from the heap to minimise memory usage. If we
- * can't get heap memory for these small buffers, we fall back to using
- * the page allocator.
- */
- if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
- xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ if (xfs_buftarg_is_mem(new_bp->b_target)) {
+ error = xmbuf_map_page(new_bp);
+ } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+ xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ /*
+ * For buffers that fit entirely within a single page, first
+ * attempt to allocate the memory from the heap to minimise
+ * memory usage. If we can't get heap memory for these small
+ * buffers, we fall back to using the page allocator.
+ */
error = xfs_buf_alloc_pages(new_bp, flags);
- if (error)
- goto out_free_buf;
}
+ if (error)
+ goto out_free_buf;
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ spin_lock(&bch->bc_lock);
+ bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
error = PTR_ERR(bp);
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
if (bp) {
/* found an existing buffer */
atomic_inc(&bp->b_hold);
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
@@ -657,17 +674,40 @@ xfs_buf_find_insert(
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
out_free_buf:
xfs_buf_free(new_bp);
out_drop_pag:
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
return error;
}
+static inline struct xfs_perag *
+xfs_buftarg_get_pag(
+ struct xfs_buftarg *btp,
+ const struct xfs_buf_map *map)
+{
+ struct xfs_mount *mp = btp->bt_mount;
+
+ if (xfs_buftarg_is_mem(btp))
+ return NULL;
+ return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
+}
+
+static inline struct xfs_buf_cache *
+xfs_buftarg_buf_cache(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag)
+{
+ if (pag)
+ return &pag->pag_bcache;
+ return btp->bt_cache;
+}
+
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -681,6 +721,7 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
+ struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -696,10 +737,10 @@ xfs_buf_get_map(
if (error)
return error;
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+ pag = xfs_buftarg_get_pag(btp, &cmap);
+ bch = xfs_buftarg_buf_cache(btp, pag);
- error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ error = xfs_buf_lookup(bch, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -711,13 +752,14 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
- error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
} else {
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
@@ -745,7 +787,8 @@ xfs_buf_get_map(
return 0;
out_put_perag:
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
return error;
}
@@ -892,6 +935,13 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
+ /*
+ * Currently we don't have a good means or justification for performing
+ * xmbuf_map_page asynchronously, so we don't do readahead.
+ */
+ if (xfs_buftarg_is_mem(target))
+ return;
+
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -957,7 +1007,10 @@ xfs_buf_get_uncached(
if (error)
return error;
- error = xfs_buf_alloc_pages(bp, flags);
+ if (xfs_buftarg_is_mem(bp->b_target))
+ error = xmbuf_map_page(bp);
+ else
+ error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
@@ -990,29 +1043,29 @@ xfs_buf_hold(
atomic_inc(&bp->b_hold);
}
-/*
- * Release a hold on the specified buffer. If the hold count is 1, the buffer is
- * placed on LRU or freed (depending on b_lru_ref).
- */
-void
-xfs_buf_rele(
+static void
+xfs_buf_rele_uncached(
+ struct xfs_buf *bp)
+{
+ ASSERT(list_empty(&bp->b_lru));
+ if (atomic_dec_and_test(&bp->b_hold)) {
+ xfs_buf_ioacct_dec(bp);
+ xfs_buf_free(bp);
+ }
+}
+
+static void
+xfs_buf_rele_cached(
struct xfs_buf *bp)
{
+ struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
+ struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
- if (!pag) {
- ASSERT(list_empty(&bp->b_lru));
- if (atomic_dec_and_test(&bp->b_hold)) {
- xfs_buf_ioacct_dec(bp);
- xfs_buf_free(bp);
- }
- return;
- }
-
ASSERT(atomic_read(&bp->b_hold) > 0);
/*
@@ -1026,7 +1079,7 @@ xfs_buf_rele(
* leading to a use-after-free scenario.
*/
spin_lock(&bp->b_lock);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1047,11 +1100,11 @@ xfs_buf_rele(
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
- if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
} else {
/*
* most of the time buffers will already be removed from the
@@ -1060,16 +1113,17 @@ xfs_buf_rele(
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
+ rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
+ spin_unlock(&bch->bc_lock);
+ if (pag)
+ xfs_perag_put(pag);
freebuf = true;
}
@@ -1080,6 +1134,19 @@ out_unlock:
xfs_buf_free(bp);
}
+/*
+ * Release a hold on the specified buffer.
+ */
+void
+xfs_buf_rele(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_rele(bp, _RET_IP_);
+ if (xfs_buf_is_uncached(bp))
+ xfs_buf_rele_uncached(bp);
+ else
+ xfs_buf_rele_cached(bp);
+}
/*
* Lock a buffer object, if it is not already locked.
@@ -1585,6 +1652,12 @@ _xfs_buf_ioapply(
/* we only use the buffer cache for meta-data */
op |= REQ_META;
+ /* in-memory targets are directly mapped, no IO required. */
+ if (xfs_buftarg_is_mem(bp->b_target)) {
+ xfs_buf_ioend(bp);
+ return;
+ }
+
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
* into the buffer and the desired IO size before we start -
@@ -1940,25 +2013,30 @@ xfs_buftarg_shrink_count(
}
void
-xfs_free_buftarg(
+xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
+}
+void
+xfs_free_buftarg(
+ struct xfs_buftarg *btp)
+{
+ xfs_destroy_buftarg(btp);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
fput(btp->bt_bdev_file);
-
- kmem_free(btp);
+ kfree(btp);
}
int
xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
+ struct xfs_buftarg *btp,
unsigned int sectorsize)
{
/* Set up metadata sector size info */
@@ -1972,23 +2050,46 @@ xfs_setsize_buftarg(
return -EINVAL;
}
- /* Set up device logical sector size mask */
- btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
- btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
-
return 0;
}
-/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used at this early stage. Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp)
+int
+xfs_init_buftarg(
+ struct xfs_buftarg *btp,
+ size_t logical_sectorsize,
+ const char *descr)
{
- return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = logical_sectorsize;
+ btp->bt_logical_sectormask = logical_sectorsize - 1;
+
+ /*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages
+ * per 30 seconds so as to not spam logs too much on repeated errors.
+ */
+ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (list_lru_init(&btp->bt_lru))
+ return -ENOMEM;
+ if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ goto out_destroy_lru;
+
+ btp->bt_shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
+ if (!btp->bt_shrinker)
+ goto out_destroy_io_count;
+ btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker->private_data = btp;
+ shrinker_register(btp->bt_shrinker);
+ return 0;
+
+out_destroy_io_count:
+ percpu_counter_destroy(&btp->bt_io_count);
+out_destroy_lru:
+ list_lru_destroy(&btp->bt_lru);
+ return -ENOMEM;
}
struct xfs_buftarg *
@@ -1996,13 +2097,13 @@ xfs_alloc_buftarg(
struct xfs_mount *mp,
struct file *bdev_file)
{
- xfs_buftarg_t *btp;
+ struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
#endif
- btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
+ btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
btp->bt_mount = mp;
btp->bt_bdev_file = bdev_file;
@@ -2012,40 +2113,19 @@ xfs_alloc_buftarg(
mp, ops);
/*
- * Buffer IO error rate limiting. Limit it to no more than 10 messages
- * per 30 seconds so as to not spam logs too much on repeated errors.
+ * When allocating the buftargs we have not yet read the super block and
+ * thus don't know the file system sector size yet.
*/
- ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
- DEFAULT_RATELIMIT_BURST);
-
- if (xfs_setsize_buftarg_early(btp))
+ if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
goto error_free;
-
- if (list_lru_init(&btp->bt_lru))
+ if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
+ mp->m_super->s_id))
goto error_free;
- if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
- goto error_lru;
-
- btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
- mp->m_super->s_id);
- if (!btp->bt_shrinker)
- goto error_pcpu;
-
- btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
- btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
- btp->bt_shrinker->private_data = btp;
-
- shrinker_register(btp->bt_shrinker);
-
return btp;
-error_pcpu:
- percpu_counter_destroy(&btp->bt_io_count);
-error_lru:
- list_lru_destroy(&btp->bt_lru);
error_free:
- kmem_free(btp);
+ kfree(btp);
return NULL;
}
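The retained comment above spells out why _xfs_buf_map_pages() keeps its scoped-nofs wrapper even after the GFP_NOFS conversion: __GFP_NOLOCKDEP cannot reach the allocations that vm_map_ram() makes internally. The idiom in isolation, as a sketch:

    unsigned int	nofs_flag;
    void		*addr;

    /*
     * Force vm_map_ram()'s internal allocations (pagetables etc.) into
     * nofs context so memory reclaim cannot recurse into the filesystem.
     */
    nofs_flag = memalloc_nofs_save();
    addr = vm_map_ram(pages, page_count, -1);
    memalloc_nofs_restore(nofs_flag);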
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 304e858d04fb..b1580644501f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t;
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
+struct xfs_buf_cache {
+ spinlock_t bc_lock;
+ struct rhashtable bc_hash;
+};
+
+int xfs_buf_cache_init(struct xfs_buf_cache *bch);
+void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
+
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
@@ -96,11 +104,12 @@ typedef unsigned int xfs_buf_flags_t;
* The latter is derived from the underlying device, and controls direct IO
* alignment constraints.
*/
-typedef struct xfs_buftarg {
+struct xfs_buftarg {
dev_t bt_dev;
struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
+ struct file *bt_file;
u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
@@ -114,7 +123,10 @@ typedef struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
-} xfs_buftarg_t;
+
+ /* built-in cache, if we're not using the perag one */
+ struct xfs_buf_cache bt_cache[];
+};
#define XB_PAGES 2
@@ -379,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+/* for xfs_buf_mem.c only: */
+int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
+ const char *descr);
+void xfs_destroy_buftarg(struct xfs_buftarg *btp);
+
#endif /* __XFS_BUF_H__ */
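struct xfs_buf_cache extracts the lock-plus-rhashtable pair that used to live in struct xfs_perag, so the same cache can now also be embedded in a buftarg. Its lifecycle, sketched after the perag and xmbuf setup paths:

    struct xfs_buf_cache	bch;
    int			error;

    error = xfs_buf_cache_init(&bch);	/* spinlock + rhashtable */
    if (error)
    	return error;
    /* ... buffer lookup/insert/removal under bch.bc_lock ... */
    xfs_buf_cache_destroy(&bch);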
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 023d4e0385dd..43031842341a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -805,8 +805,8 @@ xfs_buf_item_get_format(
return;
}
- bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
- 0);
+ bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
}
STATIC void
@@ -814,7 +814,7 @@ xfs_buf_item_free_format(
struct xfs_buf_log_item *bip)
{
if (bip->bli_formats != &bip->__bli_format) {
- kmem_free(bip->bli_formats);
+ kfree(bip->bli_formats);
bip->bli_formats = NULL;
}
}
@@ -1044,7 +1044,7 @@ xfs_buf_item_free(
struct xfs_buf_log_item *bip)
{
xfs_buf_item_free_format(bip);
- kmem_free(bip->bli_item.li_lv_shadow);
+ kvfree(bip->bli_item.li_lv_shadow);
kmem_cache_free(xfs_buf_item_cache, bip);
}
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 43167f543afc..09e893cf563c 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -85,7 +85,7 @@ xlog_add_buffer_cancelled(
return false;
}
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+ bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL);
bcp->bc_blkno = blkno;
bcp->bc_len = len;
bcp->bc_refcount = 1;
@@ -129,7 +129,7 @@ xlog_put_buffer_cancelled(
if (--bcp->bc_refcount == 0) {
list_del(&bcp->bc_list);
- kmem_free(bcp);
+ kfree(bcp);
}
return true;
}
@@ -1062,10 +1062,10 @@ xlog_free_buf_cancel_table(
&log->l_buf_cancel_table[i],
struct xfs_buf_cancel, bc_list))) {
list_del(&bc->bc_list);
- kmem_free(bc);
+ kfree(bc);
}
}
- kmem_free(log->l_buf_cancel_table);
+ kfree(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
}
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
new file mode 100644
index 000000000000..8ad38c64708e
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_buf.h"
+#include "xfs_buf_mem.h"
+#include "xfs_trace.h"
+#include <linux/shmem_fs.h>
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+/*
+ * Buffer Cache for In-Memory Files
+ * ================================
+ *
+ * Online fsck wants to create ephemeral ordered recordsets. The existing
+ * btree infrastructure can do this, but we need the buffer cache to target
+ * memory instead of block devices.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xmbuf must be freed with xmbuf_free.
+ *
+ * xmbufs assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * The only supported block size is PAGE_SIZE, and we cannot use highmem.
+ */
+
+/*
+ * shmem files used to back an in-memory buffer cache must not be exposed to
+ * userspace. Upper layers must coordinate access to the one handle returned
+ * by the constructor, so establish a separate lock class for xmbufs to avoid
+ * confusing lockdep.
+ */
+static struct lock_class_key xmbuf_i_mutex_key;
+
+/*
+ * Allocate a buffer cache target for a memory-backed file and set up the
+ * buffer target.
+ */
+int
+xmbuf_alloc(
+ struct xfs_mount *mp,
+ const char *descr,
+ struct xfs_buftarg **btpp)
+{
+ struct file *file;
+ struct inode *inode;
+ struct xfs_buftarg *btp;
+ int error;
+
+ btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+ if (!btp)
+ return -ENOMEM;
+
+ file = shmem_kernel_file_setup(descr, 0, 0);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out_free_btp;
+ }
+ inode = file_inode(file);
+
+ /* private file, private locking */
+ lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);
+
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
+ /* ensure all writes are below EOF to avoid pagecache zeroing */
+ i_size_write(inode, inode->i_sb->s_maxbytes);
+
+ trace_xmbuf_create(btp);
+
+ error = xfs_buf_cache_init(btp->bt_cache);
+ if (error)
+ goto out_file;
+
+ /* Initialize buffer target */
+ btp->bt_mount = mp;
+ btp->bt_dev = (dev_t)-1U;
+ btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
+ btp->bt_file = file;
+ btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
+ btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
+
+ error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
+ if (error)
+ goto out_bcache;
+
+ *btpp = btp;
+ return 0;
+
+out_bcache:
+ xfs_buf_cache_destroy(btp->bt_cache);
+out_file:
+ fput(file);
+out_free_btp:
+ kfree(btp);
+ return error;
+}
+
+/* Free a buffer cache target for a memory-backed buffer cache. */
+void
+xmbuf_free(
+ struct xfs_buftarg *btp)
+{
+ ASSERT(xfs_buftarg_is_mem(btp));
+ ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+
+ trace_xmbuf_free(btp);
+
+ xfs_destroy_buftarg(btp);
+ xfs_buf_cache_destroy(btp->bt_cache);
+ fput(btp->bt_file);
+ kfree(btp);
+}
+
+/* Directly map a shmem page into the buffer cache. */
+int
+xmbuf_map_page(
+ struct xfs_buf *bp)
+{
+ struct inode *inode = file_inode(bp->b_target->bt_file);
+ struct folio *folio = NULL;
+ struct page *page;
+ loff_t pos = BBTOB(xfs_buf_daddr(bp));
+ int error;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ if (bp->b_map_count != 1)
+ return -ENOMEM;
+ if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
+ return -ENOMEM;
+ if (offset_in_page(pos) != 0) {
+ ASSERT(offset_in_page(pos));
+ return -ENOMEM;
+ }
+
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+ if (error)
+ return error;
+
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
+
+ page = folio_file_page(folio, pos >> PAGE_SHIFT);
+
+ /*
+ * Mark the page dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xmbuf_unmap_page.
+ */
+ set_page_dirty(page);
+ unlock_page(page);
+
+ bp->b_addr = page_address(page);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = page;
+ bp->b_page_count = 1;
+ return 0;
+}
+
+/* Unmap a shmem page that was mapped into the buffer cache. */
+void
+xmbuf_unmap_page(
+ struct xfs_buf *bp)
+{
+ struct page *page = bp->b_pages[0];
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ put_page(page);
+
+ bp->b_addr = NULL;
+ bp->b_pages[0] = NULL;
+ bp->b_pages = NULL;
+ bp->b_page_count = 0;
+}
+
+/* Is this a valid daddr within the buftarg? */
+bool
+xmbuf_verify_daddr(
+ struct xfs_buftarg *btp,
+ xfs_daddr_t daddr)
+{
+ struct inode *inode = file_inode(btp->bt_file);
+
+ ASSERT(xfs_buftarg_is_mem(btp));
+
+ return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
+}
+
+/* Discard the page backing this buffer. */
+static void
+xmbuf_stale(
+ struct xfs_buf *bp)
+{
+ struct inode *inode = file_inode(bp->b_target->bt_file);
+ loff_t pos;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ pos = BBTOB(xfs_buf_daddr(bp));
+ shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1);
+}
+
+/*
+ * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * write verifier to detect problems.
+ */
+int
+xmbuf_finalize(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error = 0;
+
+ if (bp->b_flags & XBF_STALE) {
+ xmbuf_stale(bp);
+ return 0;
+ }
+
+ /*
+ * Although this btree is ephemeral, validate the buffer structure so
+ * that we can detect memory corruption errors and software bugs.
+ */
+ fa = bp->b_ops->verify_struct(bp);
+ if (fa) {
+ error = -EFSCORRUPTED;
+ xfs_verifier_error(bp, error, fa);
+ }
+
+ return error;
+}
+
+/*
+ * Detach this xmbuf buffer from the transaction by any means necessary.
+ * All buffers are direct-mapped, so they do not need bwrite.
+ */
+void
+xmbuf_trans_bdetach(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bli = bp->b_log_item;
+
+ ASSERT(bli != NULL);
+
+ bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
+ XFS_BLI_LOGGED | XFS_BLI_STALE);
+ clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);
+
+ while (bp->b_log_item != NULL)
+ xfs_trans_bdetach(tp, bp);
+}
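Taken together, the new file gives an in-memory buftarg a simple lifetime; a hedged sketch (error handling abbreviated, the caller and description string are illustrative):

    struct xfs_buftarg	*btp;
    int			error;

    error = xmbuf_alloc(mp, "xfs-scrub-staging", &btp);
    if (error)
    	return error;

    /* buffers created against btp map shmem pages directly */
    /* ... build and query an ephemeral btree backed by btp ... */

    xmbuf_free(btp);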
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
new file mode 100644
index 000000000000..eed4a7b63232
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BUF_MEM_H__
+#define __XFS_BUF_MEM_H__
+
+#define XMBUF_BLOCKSIZE (PAGE_SIZE)
+#define XMBUF_BLOCKSHIFT (PAGE_SHIFT)
+
+#ifdef CONFIG_XFS_MEMORY_BUFS
+static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp)
+{
+ return btp->bt_bdev == NULL;
+}
+
+int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
+ struct xfs_buftarg **btpp);
+void xmbuf_free(struct xfs_buftarg *btp);
+
+int xmbuf_map_page(struct xfs_buf *bp);
+void xmbuf_unmap_page(struct xfs_buf *bp);
+bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
+void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
+int xmbuf_finalize(struct xfs_buf *bp);
+#else
+# define xfs_buftarg_is_mem(...) (false)
+# define xmbuf_map_page(...) (-ENOMEM)
+# define xmbuf_unmap_page(...) ((void)0)
+# define xmbuf_verify_daddr(...) (false)
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#endif /* __XFS_BUF_MEM_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index cc6dc56f455d..cf9296b7e06f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -118,8 +118,10 @@ xfs_dir2_sf_getdents(
ctx->pos = off & 0x7fffffff;
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(sfep->name,
- sfep->namelen)))
+ sfep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
+ }
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
xfs_dir3_get_dtype(mp, filetype)))
return 0;
@@ -211,6 +213,7 @@ xfs_dir2_block_getdents(
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(dep->name,
dep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_rele;
}
@@ -465,6 +468,7 @@ xfs_dir2_leaf_getdents(
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(dep->name,
dep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -522,7 +526,7 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d5787991bb5b..268bb734dc0a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -8,6 +8,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
@@ -18,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
/*
* Notes on an efficient, low latency fstrim algorithm
@@ -79,7 +81,7 @@ xfs_discard_endio_work(
container_of(work, struct xfs_busy_extents, endio_work);
xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
- kmem_free(extents->owner);
+ kfree(extents->owner);
}
/*
@@ -120,7 +122,7 @@ xfs_discard_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, &bio);
+ GFP_KERNEL, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
@@ -155,6 +157,7 @@ xfs_trim_gather_extents(
uint64_t *blocks_trimmed)
{
struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
int error;
@@ -168,11 +171,15 @@ xfs_trim_gather_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
+ error = xfs_trans_alloc_empty(mp, &tp);
if (error)
return error;
- cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ goto out_trans_cancel;
+
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Look up the extent length requested in the AGF and start with it.
@@ -204,6 +211,7 @@ xfs_trim_gather_extents(
if (error)
break;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
break;
}
@@ -279,7 +287,8 @@ next_extent:
xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
xfs_btree_del_cursor(cur, error);
- xfs_buf_relse(agbp);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
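Note the structural change in xfs_trim_gather_extents(): the AGF is now held by an empty transaction rather than a bare buffer reference, and the buffer is released by cancelling that transaction. The idiom, condensed from the hunks above:

    error = xfs_trans_alloc_empty(mp, &tp);
    if (error)
    	return error;

    error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
    if (error)
    	goto out_trans_cancel;

    cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
    /* ... gather busy extents ... */

    out_trans_cancel:
    	xfs_trans_cancel(tp);	/* also releases agbp */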
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4f20d9c8f98..30d36596a2e4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -24,6 +24,7 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Lock order:
@@ -44,6 +45,29 @@ static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
+/* Record observations of quota corruption with the health tracking system. */
+static void
+xfs_dquot_mark_sick(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+
+ switch (dqp->q_type) {
+ case XFS_DQTYPE_USER:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA);
+ break;
+ case XFS_DQTYPE_GROUP:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
+ break;
+ case XFS_DQTYPE_PROJ:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
/*
* This is called to free all the memory associated with a dquot
*/
@@ -53,7 +77,7 @@ xfs_qm_dqdestroy(
{
ASSERT(list_empty(&dqp->q_lru));
- kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
+ kvfree(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
@@ -451,6 +475,8 @@ xfs_dquot_disk_read(
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dquot_mark_sick(dqp);
if (error) {
ASSERT(bp == NULL);
return error;
@@ -574,6 +600,7 @@ xfs_dquot_from_disk(
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+ xfs_dquot_mark_sick(dqp);
return -EFSCORRUPTED;
}
@@ -950,7 +977,7 @@ xfs_qm_dqget_inode(
if (error)
return error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(xfs_inode_dquot(ip, type) == NULL);
id = xfs_qm_id_for_quotatype(ip, type);
@@ -1007,7 +1034,7 @@ restart:
}
dqret:
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
trace_xfs_dqget_miss(dqp);
*O_dqpp = dqp;
return 0;
@@ -1238,6 +1265,8 @@ xfs_qm_dqflush(
&bp, &xfs_dquot_buf_ops);
if (error == -EAGAIN)
goto out_unlock;
+ if (xfs_metadata_is_sick(error))
+ xfs_dquot_mark_sick(dqp);
if (error)
goto out_abort;
@@ -1246,6 +1275,7 @@ xfs_qm_dqflush(
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
dqp->q_id, fa);
xfs_buf_relse(bp);
+ xfs_dquot_mark_sick(dqp);
error = -EFSCORRUPTED;
goto out_abort;
}
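The dquot hunks follow the pattern this series applies at each read site: classify the error with xfs_metadata_is_sick() and record the corruption with the health subsystem before returning. Schematically:

    error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
    		mp->m_quotainfo->qi_dqchunklen, 0, &bp,
    		&xfs_dquot_buf_ops);
    if (xfs_metadata_is_sick(error))	/* e.g. -EFSCORRUPTED */
    	xfs_dquot_mark_sick(dqp);	/* maps q_type to a FS sick mask */
    if (error)
    	return error;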
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b2cbbba3e15a..7ad0e92c6b5b 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -240,15 +240,15 @@ xfs_errortag_init(
{
int ret;
- mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
- KM_MAYFAIL);
+ mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
&mp->m_kobj, "errortag");
if (ret)
- kmem_free(mp->m_errortag);
+ kfree(mp->m_errortag);
return ret;
}
@@ -257,7 +257,7 @@ xfs_errortag_del(
struct xfs_mount *mp)
{
xfs_sysfs_del(&mp->m_errortag_kobj);
- kmem_free(mp->m_errortag);
+ kfree(mp->m_errortag);
}
static bool
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 2ccde32c9a9e..56cfa1498571 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -32,7 +32,8 @@ xfs_extent_busy_insert_list(
struct rb_node **rbp;
struct rb_node *parent = NULL;
- new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
+ new = kzalloc(sizeof(struct xfs_extent_busy),
+ GFP_KERNEL | __GFP_NOFAIL);
new->agno = pag->pag_agno;
new->bno = bno;
new->length = len;
@@ -530,7 +531,7 @@ xfs_extent_busy_clear_one(
}
list_del_init(&busyp->list);
- kmem_free(busyp);
+ kfree(busyp);
}
static void
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 1d1185fca6a5..8c382f092332 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,9 +40,9 @@ STATIC void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
- kmem_free(efip->efi_item.li_lv_shadow);
+ kvfree(efip->efi_item.li_lv_shadow);
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
- kmem_free(efip);
+ kfree(efip);
else
kmem_cache_free(xfs_efi_cache, efip);
}
@@ -229,9 +229,9 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_efd_item_free(struct xfs_efd_log_item *efdp)
{
- kmem_free(efdp->efd_item.li_lv_shadow);
+ kvfree(efdp->efd_item.li_lv_shadow);
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
- kmem_free(efdp);
+ kfree(efdp);
else
kmem_cache_free(xfs_efd_cache, efdp);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..632653e00906 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -879,7 +879,7 @@ xfs_break_dax_layouts(
{
struct page *page;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
@@ -900,7 +900,7 @@ xfs_break_layouts(
bool retry;
int error;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
do {
retry = false;
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 2fc98d313708..e3aaa0555597 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -44,7 +44,7 @@ xfs_fstrm_free_func(
atomic_dec(&pag->pagf_fstrms);
xfs_perag_rele(pag);
- kmem_free(item);
+ kfree(item);
}
/*
@@ -313,7 +313,7 @@ xfs_filestream_create_association(
* we return a referenced AG, the allocation can still go ahead just
* fine.
*/
- item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!item)
goto out_put_fstrms;
@@ -326,7 +326,7 @@ xfs_filestream_create_association(
out_free_item:
xfs_perag_rele(item->pag);
- kmem_free(item);
+ kfree(item);
out_put_fstrms:
atomic_dec(&args->pag->pagf_fstrms);
return 0;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a72217f5feb..de59eec74765 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -763,8 +763,8 @@ xfs_getfsmap_datadev_bnobt_query(
return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
/* Allocate cursor for this AG and query_range it. */
- *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->pag, XFS_BTNUM_BNO);
+ *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->pag);
key->ar_startblock = info->low.rm_startblock;
key[1].ar_startblock = info->high.rm_startblock;
return xfs_alloc_query_range(*curpp, key, &key[1],
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 9a57afee9338..b39f959146bc 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -14,6 +14,10 @@
#include "xfs_trace.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
/*
* Warn about metadata corruption that we detected but haven't fixed, and
@@ -93,11 +97,25 @@ xfs_fs_mark_sick(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
trace_xfs_fs_mark_sick(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_fs_sick |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark per-fs metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_fs_mark_corrupt(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
+ trace_xfs_fs_mark_corrupt(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick |= mask;
mp->m_fs_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
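The split above separates two meanings that the old helper conflated: xfs_fs_mark_sick() now records a runtime corruption observation without touching the checked mask, while xfs_fs_mark_corrupt() is for fsck verdicts and sets both. A caller-side sketch:

    /* runtime observation: sick, but not marked as having been checked */
    xfs_fs_mark_sick(mp, XFS_SICK_FS_NLINKS);

    /* scrub verdict: sick and checked, so reporting can distinguish
     * "verified bad" from "never examined" */
    xfs_fs_mark_corrupt(mp, XFS_SICK_FS_NLINKS);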
@@ -108,11 +126,13 @@ xfs_fs_mark_healthy(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
trace_xfs_fs_mark_healthy(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_fs_sick &= ~mask;
+ if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY))
+ mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY;
mp->m_fs_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -136,11 +156,25 @@ xfs_rt_mark_sick(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
trace_xfs_rt_mark_sick(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_rt_sick |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark realtime metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_rt_mark_corrupt(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
+ trace_xfs_rt_mark_corrupt(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick |= mask;
mp->m_rt_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -151,11 +185,13 @@ xfs_rt_mark_healthy(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
trace_xfs_rt_mark_healthy(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_rt_sick &= ~mask;
+ if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY))
+ mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY;
mp->m_rt_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -173,17 +209,48 @@ xfs_rt_measure_sickness(
spin_unlock(&mp->m_sb_lock);
}
+/* Mark unhealthy per-ag metadata given a raw AG number. */
+void
+xfs_agno_mark_sick(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ unsigned int mask)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ /* per-ag structure not set up yet? */
+ if (!pag)
+ return;
+
+ xfs_ag_mark_sick(pag, mask);
+ xfs_perag_put(pag);
+}
+
/* Mark unhealthy per-ag metadata. */
void
xfs_ag_mark_sick(
struct xfs_perag *pag,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
spin_lock(&pag->pag_state_lock);
pag->pag_sick |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_ag_mark_corrupt(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
+ trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick |= mask;
pag->pag_checked |= mask;
spin_unlock(&pag->pag_state_lock);
}
@@ -194,11 +261,13 @@ xfs_ag_mark_healthy(
struct xfs_perag *pag,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
spin_lock(&pag->pag_state_lock);
pag->pag_sick &= ~mask;
+ if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY))
+ pag->pag_sick &= ~XFS_SICK_AG_SECONDARY;
pag->pag_checked |= mask;
spin_unlock(&pag->pag_state_lock);
}
@@ -222,11 +291,34 @@ xfs_inode_mark_sick(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
trace_xfs_inode_mark_sick(ip, mask);
spin_lock(&ip->i_flags_lock);
ip->i_sick |= mask;
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Keep this inode around so we don't lose the sickness report. Scrub
+ * grabs inodes with DONTCACHE assuming that most inodes are ok, which
+ * is not the case here.
+ */
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
+}
+
+/* Mark inode metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_inode_mark_corrupt(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
+ trace_xfs_inode_mark_corrupt(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick |= mask;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
@@ -246,11 +338,13 @@ xfs_inode_mark_healthy(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
trace_xfs_inode_mark_healthy(ip, mask);
spin_lock(&ip->i_flags_lock);
ip->i_sick &= ~mask;
+ if (!(ip->i_sick & XFS_SICK_INO_PRIMARY))
+ ip->i_sick &= ~XFS_SICK_INO_SECONDARY;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
}
@@ -280,6 +374,8 @@ static const struct ioctl_sick_map fs_map[] = {
{ XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
{ XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA },
{ XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
+ { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
+ { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS },
{ 0, 0 },
};
@@ -335,6 +431,7 @@ static const struct ioctl_sick_map ag_map[] = {
{ XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT },
{ XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
{ XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
+ { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES },
{ 0, 0 },
};
@@ -397,3 +494,92 @@ xfs_bulkstat_health(
bs->bs_sick |= m->ioctl_mask;
}
}
+
+/* Mark a block mapping sick. */
+void
+xfs_bmap_mark_sick(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int mask;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ mask = XFS_SICK_INO_BMBTD;
+ break;
+ case XFS_ATTR_FORK:
+ mask = XFS_SICK_INO_BMBTA;
+ break;
+ case XFS_COW_FORK:
+ mask = XFS_SICK_INO_BMBTC;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ xfs_inode_mark_sick(ip, mask);
+}
+
+/* Record observations of btree corruption with the health tracking system. */
+void
+xfs_btree_mark_sick(
+ struct xfs_btree_cur *cur)
+{
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ /* no health state tracking for ephemeral btrees */
+ return;
+ case XFS_BTREE_TYPE_AG:
+ ASSERT(cur->bc_ops->sick_mask);
+ xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
+ return;
+ case XFS_BTREE_TYPE_INODE:
+ if (xfs_btree_is_bmap(cur->bc_ops)) {
+ xfs_bmap_mark_sick(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+ return;
+ }
+ fallthrough;
+ default:
+ ASSERT(0);
+ return;
+ }
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system.
+ */
+void
+xfs_dirattr_mark_sick(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int mask;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ mask = XFS_SICK_INO_DIR;
+ break;
+ case XFS_ATTR_FORK:
+ mask = XFS_SICK_INO_XATTR;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ xfs_inode_mark_sick(ip, mask);
+}
+
+/*
+ * Record observations of dir/attr btree corruption when all we have is the
+ * da_args for the operation.
+ */
+void
+xfs_da_mark_sick(
+ struct xfs_da_args *args)
+{
+ xfs_dirattr_mark_sick(args->dp, args->whichfork);
+}
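
The health tracking changes above split reporting into three verbs: *_mark_sick records a runtime observation (sick bit only), the new *_mark_corrupt records an fsck observation (sick plus checked), and *_mark_healthy now also drops secondary state once the last primary sickness clears. A minimal userspace model of those rules, with illustrative mask values rather than the kernel's:

/*
 * Userspace model of the new health-state rules; the mask names mirror
 * the kernel's but the values are illustrative.
 */
#include <assert.h>
#include <stdio.h>

#define SICK_PRIMARY	0x0fu	/* directly observed metadata */
#define SICK_SECONDARY	0xf0u	/* derived, cross-referenced state */

struct health { unsigned int sick, checked; };

static void mark_sick(struct health *h, unsigned int mask)
{
	h->sick |= mask;		/* runtime observation: sick only */
}

static void mark_corrupt(struct health *h, unsigned int mask)
{
	h->sick |= mask;		/* fsck observation: sick... */
	h->checked |= mask;		/* ...and recorded as checked */
}

static void mark_healthy(struct health *h, unsigned int mask)
{
	h->sick &= ~mask;
	/* once no primary sickness remains, secondary state is moot */
	if (!(h->sick & SICK_PRIMARY))
		h->sick &= ~SICK_SECONDARY;
	h->checked |= mask;
}

int main(void)
{
	struct health h = { 0, 0 };

	mark_sick(&h, 0x01);		/* runtime hit: not "checked" yet */
	assert(h.sick == 0x01 && h.checked == 0);
	mark_corrupt(&h, 0x10 | 0x01);
	mark_healthy(&h, 0x01);		/* clears the secondary 0x10 too */
	assert(h.sick == 0);
	printf("model ok\n");
	return 0;
}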
diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c
new file mode 100644
index 000000000000..a58d1de2d37d
--- /dev/null
+++ b/fs/xfs/xfs_hooks.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/* Initialize a notifier chain. */
+void
+xfs_hooks_init(
+ struct xfs_hooks *chain)
+{
+ BLOCKING_INIT_NOTIFIER_HEAD(&chain->head);
+}
+
+/* Make it so a function gets called whenever we hit a certain hook point. */
+int
+xfs_hooks_add(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ ASSERT(hook->nb.notifier_call != NULL);
+ BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0);
+
+ return blocking_notifier_chain_register(&chain->head, &hook->nb);
+}
+
+/* Remove a previously installed hook. */
+void
+xfs_hooks_del(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ blocking_notifier_chain_unregister(&chain->head, &hook->nb);
+}
+
+/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */
+int
+xfs_hooks_call(
+ struct xfs_hooks *chain,
+ unsigned long val,
+ void *priv)
+{
+ return blocking_notifier_call_chain(&chain->head, val, priv);
+}
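
The new file above is a thin veneer over blocking notifier chains; the BUILD_BUG_ON pins struct notifier_block to offset zero of struct xfs_hook so clients can embed the hook and recover their own wrapper. A sketch of a hypothetical client against this API (struct myscan and its fields are illustrative, not kernel code):

struct myscan {
	struct xfs_hook		hook;	/* must embed the xfs_hook */
	unsigned long		events_seen;
};

static int
myscan_fn(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	/*
	 * nb is the first field of struct xfs_hook, which is the first
	 * field of struct myscan, so container_of recovers the client.
	 */
	struct myscan		*ms = container_of(nb, struct myscan,
						   hook.nb);

	ms->events_seen++;
	return NOTIFY_DONE;
}

static int
myscan_attach(
	struct xfs_hooks	*chain,
	struct myscan		*ms)
{
	xfs_hook_setup(&ms->hook, myscan_fn);
	return xfs_hooks_add(chain, &ms->hook);
}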
diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h
new file mode 100644
index 000000000000..60b8a5831536
--- /dev/null
+++ b/fs/xfs/xfs_hooks.h
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_HOOKS_H_
+#define XFS_HOOKS_H_
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+struct xfs_hooks {
+ struct blocking_notifier_head head;
+};
+
+/*
+ * If jump labels are enabled in Kconfig, the static key uses nop sleds and
+ * code patching to eliminate the overhead of taking the rwsem in
+ * blocking_notifier_call_chain when there are no hooks configured. If not,
+ * the static key per-call overhead is an atomic read. Most arches that can
+ * handle XFS also support jump labels.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \
+ static DEFINE_STATIC_KEY_FALSE(name)
+# define xfs_hooks_switch_on(name) static_branch_inc(name)
+# define xfs_hooks_switch_off(name) static_branch_dec(name)
+# define xfs_hooks_switched_on(name) static_branch_unlikely(name)
+
+struct xfs_hook {
+ /* This must come at the start of the structure. */
+ struct notifier_block nb;
+};
+
+typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action,
+ void *data);
+
+void xfs_hooks_init(struct xfs_hooks *chain);
+int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook);
+void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook);
+int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action,
+ void *priv);
+
+static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn)
+{
+ hook->nb.notifier_call = fn;
+ hook->nb.priority = 0;
+}
+
+#else
+
+struct xfs_hooks { /* empty */ };
+
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name)
+# define xfs_hooks_switch_on(name) ((void)0)
+# define xfs_hooks_switch_off(name) ((void)0)
+# define xfs_hooks_switched_on(name) (false)
+
+# define xfs_hooks_init(chain) ((void)0)
+# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE)
+#endif
+
+#endif /* XFS_HOOKS_H_ */
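
The switch macros complete the picture: a hook site defines a static key, gates the call on it, and pays only an atomic read (or, with jump labels, a nop sled) while no hook is registered. A sketch of that gating pattern under an assumed hook point name:

/* Sketch only; the hook point and its chain are hypothetical. */
DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_example_hooks_switch);

static inline void
xfs_example_update_hook(
	struct xfs_hooks	*chain,
	void			*priv)
{
	/* nop sled (or one atomic read) while no hook is registered */
	if (xfs_hooks_switched_on(&xfs_example_hooks_switch))
		xfs_hooks_call(chain, 0, priv);
}

/* The registration side pairs the chain update with the key flip. */
static int
xfs_example_hook_add(
	struct xfs_hooks	*chain,
	struct xfs_hook		*hook)
{
	int			error;

	error = xfs_hooks_add(chain, hook);
	if (!error)
		xfs_hooks_switch_on(&xfs_example_hooks_switch);
	return error;
}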
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index dba514a2c84d..e64265bc0b33 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -24,6 +24,7 @@
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -415,6 +416,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
ip->i_ino, VFS_I(ip)->i_mode);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -422,6 +426,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
ip->i_ino);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
return 0;
@@ -640,6 +647,8 @@ xfs_iget_cache_miss(
xfs_buf_offset(bp, ip->i_imap.im_boffset));
if (!error)
xfs_buf_set_ref(bp, XFS_INO_REF);
+ else
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
xfs_trans_brelse(tp, bp);
if (error)
@@ -659,10 +668,9 @@ xfs_iget_cache_miss(
/*
* Preload the radix tree so we can insert safely under the
* write spinlock. Note that we cannot sleep inside the preload
- * region. Since we can be called from transaction context, don't
- * recurse into the file system.
+ * region.
*/
- if (radix_tree_preload(GFP_NOFS)) {
+ if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
error = -EAGAIN;
goto out_destroy;
}
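
The last hunk changes the preload allocation mode, not the pattern: the preload may sleep, so it happens before the spinlock, and the insert under the lock then cannot fail for lack of memory. The pattern in isolation (function and parameter names are illustrative):

static int
example_radix_insert(
	struct radix_tree_root	*tree,
	spinlock_t		*lock,
	unsigned long		index,
	void			*item)
{
	int			error;

	/* may sleep; must happen before we take the spinlock */
	if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP))
		return -EAGAIN;

	spin_lock(lock);
	error = radix_tree_insert(tree, index, item);
	spin_unlock(lock);

	/* re-enables the preemption disabled by the preload */
	radix_tree_preload_end();
	return error;
}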
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index b05314d48176..4345db501714 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,7 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
+ kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1fd94958aa97..ea48774f6b76 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -203,9 +203,9 @@ xfs_ilock(
}
if (lock_flags & XFS_ILOCK_EXCL)
- mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
else if (lock_flags & XFS_ILOCK_SHARED)
- mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
/*
@@ -246,10 +246,10 @@ xfs_ilock_nowait(
}
if (lock_flags & XFS_ILOCK_EXCL) {
- if (!mrtryupdate(&ip->i_lock))
+ if (!down_write_trylock(&ip->i_lock))
goto out_undo_mmaplock;
} else if (lock_flags & XFS_ILOCK_SHARED) {
- if (!mrtryaccess(&ip->i_lock))
+ if (!down_read_trylock(&ip->i_lock))
goto out_undo_mmaplock;
}
return 1;
@@ -298,9 +298,9 @@ xfs_iunlock(
up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
- mrunlock_excl(&ip->i_lock);
+ up_write(&ip->i_lock);
else if (lock_flags & XFS_ILOCK_SHARED)
- mrunlock_shared(&ip->i_lock);
+ up_read(&ip->i_lock);
trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
@@ -319,7 +319,7 @@ xfs_ilock_demote(
~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
if (lock_flags & XFS_ILOCK_EXCL)
- mrdemote(&ip->i_lock);
+ downgrade_write(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -328,52 +328,30 @@ xfs_ilock_demote(
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
-#if defined(DEBUG) || defined(XFS_WARN)
-static inline bool
-__xfs_rwsem_islocked(
- struct rw_semaphore *rwsem,
- bool shared)
-{
- if (!debug_locks)
- return rwsem_is_locked(rwsem);
-
- if (!shared)
- return lockdep_is_held_type(rwsem, 0);
-
- /*
- * We are checking that the lock is held at least in shared
- * mode but don't care that it might be held exclusively
- * (i.e. shared | excl). Hence we check if the lock is held
- * in any mode rather than an explicit shared mode.
- */
- return lockdep_is_held_type(rwsem, -1);
-}
-
-bool
-xfs_isilocked(
+void
+xfs_assert_ilocked(
struct xfs_inode *ip,
uint lock_flags)
{
- if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
- if (!(lock_flags & XFS_ILOCK_SHARED))
- return !!ip->i_lock.mr_writer;
- return rwsem_is_locked(&ip->i_lock.mr_lock);
- }
-
- if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
- (lock_flags & XFS_MMAPLOCK_SHARED));
- }
+ /*
+ * Sometimes we assert the ILOCK is held exclusively, but we're in
+ * a workqueue, so lockdep doesn't know we're the owner.
+ */
+ if (lock_flags & XFS_ILOCK_SHARED)
+ rwsem_assert_held(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_EXCL)
+ rwsem_assert_held_write_nolockdep(&ip->i_lock);
- if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
- }
+ if (lock_flags & XFS_MMAPLOCK_SHARED)
+ rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
+ else if (lock_flags & XFS_MMAPLOCK_EXCL)
+ rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);
- ASSERT(0);
- return false;
+ if (lock_flags & XFS_IOLOCK_SHARED)
+ rwsem_assert_held(&VFS_I(ip)->i_rwsem);
+ else if (lock_flags & XFS_IOLOCK_EXCL)
+ rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}
-#endif
/*
* xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
@@ -671,7 +649,7 @@ xfs_lookup(
out_free_name:
if (ci_name)
- kmem_free(ci_name->name);
+ kfree(ci_name->name);
out_unlock:
*ipp = NULL;
return error;
@@ -802,6 +780,8 @@ xfs_init_new_inode(
*/
if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -947,6 +927,81 @@ xfs_bumplink(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+ struct xfs_inode *dp,
+ struct xfs_inode *ip,
+ int delta,
+ const struct xfs_name *name)
+{
+ if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+ struct xfs_dir_update_params p = {
+ .dp = dp,
+ .ip = ip,
+ .delta = delta,
+ .name = name,
+ };
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+ }
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Configure directory update hook functions. */
+void
+xfs_dir_hook_setup(
+ struct xfs_dir_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->dirent_hook, mod_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
int
xfs_create(
struct mnt_idmap *idmap,
@@ -1058,6 +1113,12 @@ xfs_create(
}
/*
+ * Create ip with a reference from dp, and add '.' and '..' references
+ * if it's a directory.
+ */
+ xfs_dir_update_hook(dp, ip, 1, name);
+
+ /*
* If this is a synchronous mount, make sure that the
* create transaction goes to disk before returning to
* the user.
@@ -1271,6 +1332,7 @@ xfs_link(
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
xfs_bumplink(tp, sip);
+ xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
* If this is a synchronous mount, make sure that the
@@ -1342,9 +1404,9 @@ xfs_itruncate_extents_flags(
xfs_fileoff_t first_unmap_block;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ if (atomic_read(&VFS_I(ip)->i_count))
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
@@ -1596,7 +1658,7 @@ xfs_inactive_ifree(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_ifree(tp, ip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
@@ -1677,6 +1739,39 @@ xfs_inode_needs_inactive(
}
/*
+ * Save health status somewhere, if we're dumping an inode with uncorrected
+ * errors and online repair isn't running.
+ */
+static inline void
+xfs_inactive_health(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ if (!sick)
+ return;
+
+ trace_xfs_inode_unfixed_corruption(ip, sick);
+
+ if (sick & XFS_SICK_INO_FORGET)
+ return;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ if (!pag) {
+ /* There had better still be a perag structure! */
+ ASSERT(0);
+ return;
+ }
+
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
+ xfs_perag_put(pag);
+}
+
+/*
* xfs_inactive
*
* This is called when the vnode reference count for the vnode
@@ -1704,6 +1799,8 @@ xfs_inactive(
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
+ xfs_inactive_health(ip);
+
/*
* If this is a read-only mount, don't do this (would generate I/O)
* unless we're in log recovery and cleaning the iunlinked list.
@@ -1910,6 +2007,7 @@ xfs_iunlink_update_bucket(
*/
if (old_value == new_agino) {
xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -1959,11 +2057,14 @@ xfs_iunlink_reload_next(
*/
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
- if (error)
+ if (error) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return error;
+ }
/* If this is not an unlinked inode, something is very wrong. */
if (VFS_I(next_ip)->i_nlink != 0) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
error = -EFSCORRUPTED;
goto rele;
}
@@ -2001,6 +2102,7 @@ xfs_iunlink_insert_inode(
if (next_agino == agino ||
!xfs_verify_agino_or_null(pag, next_agino)) {
xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -2088,6 +2190,7 @@ xfs_iunlink_remove_inode(
if (!xfs_verify_agino(pag, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -2116,8 +2219,10 @@ xfs_iunlink_remove_inode(
struct xfs_inode *prev_ip;
prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
- if (!prev_ip)
+ if (!prev_ip) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
+ }
error = xfs_iunlink_log_inode(tp, prev_ip, pag,
ip->i_next_unlinked);
@@ -2350,7 +2455,7 @@ xfs_ifree(
struct xfs_inode_log_item *iip = ip->i_itemp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(ip->i_df.if_nextents == 0);
ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
@@ -2378,7 +2483,7 @@ xfs_ifree(
* already been freed by xfs_attr_inactive.
*/
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- kmem_free(ip->i_df.if_data);
+ kfree(ip->i_df.if_data);
ip->i_df.if_data = NULL;
ip->i_df.if_bytes = 0;
}
@@ -2419,7 +2524,7 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
@@ -2585,6 +2690,12 @@ xfs_remove(
}
/*
+ * Drop the link from dp to ip, and if ip was a directory, remove the
+ * '.' and '..' references since we freed the directory.
+ */
+ xfs_dir_update_hook(dp, ip, -1, name);
+
+ /*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
* the user.
@@ -2774,6 +2885,20 @@ xfs_cross_rename(
}
xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+ /*
+ * Inform our hook clients that we've finished an exchange operation as
+ * follows: removed the source and target files from their directories;
+ * added the target to the source directory; and added the source to
+ * the target directory. All inodes are locked, so it's ok to model a
+ * rename this way so long as we say we deleted entries before we add
+ * new ones.
+ */
+ xfs_dir_update_hook(dp1, ip1, -1, name1);
+ xfs_dir_update_hook(dp2, ip2, -1, name2);
+ xfs_dir_update_hook(dp1, ip2, 1, name1);
+ xfs_dir_update_hook(dp2, ip1, 1, name2);
+
return xfs_finish_rename(tp);
out_trans_abort:
@@ -3157,6 +3282,21 @@ retry:
if (new_parent)
xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+ /*
+ * Inform our hook clients that we've finished a rename operation as
+ * follows: we removed the source and target files from their directories;
+ * we added the source to the target directory; and finally we added the
+ * whiteout, if there was one. All inodes are locked, so it's ok to model
+ * a rename this way so long as we say we
+ * deleted entries before we add new ones.
+ */
+ if (target_ip)
+ xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+ xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+ xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+ if (wip)
+ xfs_dir_update_hook(src_dp, wip, 1, src_name);
+
error = xfs_finish_rename(tp);
if (wip)
xfs_irele(wip);
@@ -3182,7 +3322,7 @@ xfs_iflush(
struct xfs_mount *mp = ip->i_mount;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
@@ -3317,6 +3457,8 @@ flush_out:
/* generate the checksum. */
xfs_dinode_calc_crc(mp, dip);
+ if (error)
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return error;
}
@@ -3777,3 +3919,19 @@ xfs_ifork_zapped(
return false;
}
}
+
+/* Compute the number of data and realtime blocks used by a file. */
+void
+xfs_inode_count_blocks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_filblks_t *dblocks,
+ xfs_filblks_t *rblocks)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ *rblocks = 0;
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_bmap_count_leaves(ifp, rblocks);
+ *dblocks = ip->i_nblocks - *rblocks;
+}
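
Both hook-ordering comments above encode the same invariant: clients observe every rename or exchange as removals followed by additions, never the reverse. A toy userspace model of why that keeps a client-side link count from transiently overcounting (indices and counters are illustrative):

#include <assert.h>

static int nlinks[2];	/* toy per-inode child link counts */

static void hook(int ino, int delta) { nlinks[ino] += delta; }

int main(void)
{
	enum { SRC_IP = 0, TARGET_IP = 1 };

	nlinks[SRC_IP] = 1;
	nlinks[TARGET_IP] = 1;

	/* rename over an existing target, in the order xfs_rename uses */
	hook(TARGET_IP, -1);	/* old target entry removed */
	hook(SRC_IP, -1);	/* source entry removed */
	hook(SRC_IP, +1);	/* source re-added under the new name */

	assert(nlinks[SRC_IP] == 1 && nlinks[TARGET_IP] == 0);
	return 0;
}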
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97f63bacd4c2..ab46ffb3ac19 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -39,7 +39,7 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
- mrlock_t i_lock; /* inode lock */
+ struct rw_semaphore i_lock; /* inode lock */
atomic_t i_pincount; /* inode pin count */
struct llist_node i_gclist; /* deferred inactivation list */
@@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
return &ip->i_vnode;
}
+/* convert from const xfs inode to const vfs inode */
+static inline const struct inode *VFS_IC(const struct xfs_inode *ip)
+{
+ return &ip->i_vnode;
+}
+
/*
* For regular files we only update the on-disk filesize when actually
* writing data back to disk. Until then only the copy in the VFS inode
@@ -523,7 +529,7 @@ void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
-bool xfs_isilocked(struct xfs_inode *, uint);
+void xfs_assert_ilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
@@ -623,5 +629,32 @@ int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip)
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
+void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
+
+struct xfs_dir_update_params {
+ const struct xfs_inode *dp;
+ const struct xfs_inode *ip;
+ const struct xfs_name *name;
+ int delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+ int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+ struct xfs_hook dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
#endif /* __XFS_INODE_H__ */
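
A hypothetical consumer of the directory-update hook API declared above; in the kernel only online fsck registers one, so struct mywatch and its callback are illustrative stand-ins:

struct mywatch {
	struct xfs_dir_hook	dhook;
	long			nr_updates;
};

static int
mywatch_fn(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	struct xfs_dir_update_params	*p = data;
	struct mywatch			*mw;

	mw = container_of(nb, struct mywatch, dhook.dirent_hook.nb);
	if (p->delta)
		mw->nr_updates++;
	return NOTIFY_DONE;
}

static int
mywatch_start(
	struct xfs_mount	*mp,
	struct mywatch		*mw)
{
	int			error;

	xfs_dir_hook_setup(&mw->dhook, mywatch_fn);
	error = xfs_dir_hook_add(mp, &mw->dhook);
	if (error)
		return error;
	/* flip the static key so xfs_dir_update_hook starts calling us */
	xfs_dir_hook_enable();
	return 0;
}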
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0aee97ba0be8..f28d653300d1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -650,7 +650,7 @@ xfs_inode_item_pin(
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
@@ -756,7 +756,7 @@ xfs_inode_item_release(
unsigned short lock_flags;
ASSERT(ip->i_itemp != NULL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
@@ -856,7 +856,7 @@ xfs_inode_item_destroy(
ASSERT(iip->ili_item.li_buf == NULL);
ip->i_itemp = NULL;
- kmem_free(iip->ili_item.li_lv_shadow);
+ kvfree(iip->ili_item.li_lv_shadow);
kmem_cache_free(xfs_ili_cache, iip);
}
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 144198a6b270..dbdab4ce7c44 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -291,7 +291,8 @@ xlog_recover_inode_commit_pass2(
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+ in_f = kmalloc(sizeof(struct xfs_inode_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -553,7 +554,7 @@ out_release:
xfs_buf_relse(bp);
error:
if (need_free)
- kmem_free(in_f);
+ kfree(in_f);
return error;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f02b6e558af5..d0e2cec6210d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -435,7 +435,7 @@ xfs_ioc_attr_list(
copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
error = -EFAULT;
out_free:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -493,7 +493,7 @@ xfs_attrmulti_attr_get(
error = -EFAULT;
out_kfree:
- kmem_free(args.value);
+ kvfree(args.value);
return error;
}
@@ -1506,7 +1506,7 @@ xfs_ioc_getbmap(
error = 0;
out_free_buf:
- kmem_free(buf);
+ kvfree(buf);
return error;
}
@@ -1636,7 +1636,7 @@ xfs_ioc_getfsmap(
}
out_free:
- kmem_free(recs);
+ kvfree(recs);
return error;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..4087af7f3c9f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -27,6 +27,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "xfs_health.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -45,6 +46,7 @@ xfs_alert_fsblock_zero(
(unsigned long long)imap->br_startoff,
(unsigned long long)imap->br_blockcount,
imap->br_state);
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -99,8 +101,10 @@ xfs_bmbt_to_iomap(
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, imap);
+ }
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
@@ -325,8 +329,10 @@ xfs_iomap_write_direct(
goto out_unlock;
}
- if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = xfs_alert_fsblock_zero(ip, imap);
+ }
out_unlock:
*seq = xfs_iomap_inode_sequence(ip, 0);
@@ -639,8 +645,10 @@ xfs_iomap_write_unwritten(
if (error)
return error;
- if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, &imap);
+ }
if ((numblks_fsb = imap.br_blockcount) == 0) {
/*
@@ -986,6 +994,7 @@ xfs_buffered_write_iomap_begin(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
}
@@ -1323,7 +1332,7 @@ xfs_seek_iomap_begin(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
- xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
@@ -1348,7 +1357,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a0d77f5f512e..66f8c47642e8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -346,7 +346,7 @@ xfs_vn_ci_lookup(
dname.name = ci_name.name;
dname.len = ci_name.len;
dentry = d_add_ci(dentry, VFS_I(ip), &dname);
- kmem_free(ci_name.name);
+ kfree(ci_name.name);
return dentry;
}
@@ -796,8 +796,7 @@ xfs_setattr_size(
uint lock_flags = 0;
bool did_zeroing = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
@@ -1285,9 +1284,9 @@ xfs_setup_inode(
*/
lockdep_set_class(&inode->i_rwsem,
&inode->i_sb->s_type->i_mutex_dir_key);
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
} else {
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
}
/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 14462614fcc8..95fc31b9f87d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -197,8 +197,8 @@ xfs_bulkstat_one(
ASSERT(breq->icount == 1);
- bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_MAYFAIL);
+ bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -214,7 +214,7 @@ xfs_bulkstat_one(
breq->startino, &bc);
xfs_trans_cancel(tp);
out:
- kmem_free(bc.buf);
+ kfree(bc.buf);
/*
* If we reported one inode to userspace then we abort because we hit
@@ -289,8 +289,8 @@ xfs_bulkstat(
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
- bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_MAYFAIL);
+ bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -309,7 +309,7 @@ xfs_bulkstat(
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
- kmem_free(bc.buf);
+ kfree(bc.buf);
/*
* We found some inodes, so clear the error status and return them.
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index b3275e8d47b6..01b55f03a102 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -22,6 +22,7 @@
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_bit.h"
/*
* Walking Inodes in the Filesystem
@@ -99,6 +100,7 @@ xfs_iwalk_ichunk_ra(
struct xfs_inobt_rec_incore *irec)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agnumber_t agno = pag->pag_agno;
xfs_agblock_t agbno;
struct blk_plug plug;
int i; /* inode chunk index */
@@ -111,8 +113,9 @@ xfs_iwalk_ichunk_ra(
imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
if (imask & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
- igeo->blocks_per_cluster,
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, agbno),
+ igeo->blocks_per_cluster * mp->m_bsize,
&xfs_inode_buf_ops);
}
agbno += igeo->blocks_per_cluster;
@@ -131,21 +134,11 @@ xfs_iwalk_adjust_start(
struct xfs_inobt_rec_incore *irec) /* btree record */
{
int idx; /* index into inode chunk */
- int i;
idx = agino - irec->ir_startino;
- /*
- * We got a right chunk with some left inodes allocated at it. Grab
- * the chunk record. Mark all the uninteresting inodes free because
- * they're before our start point.
- */
- for (i = 0; i < idx; i++) {
- if (XFS_INOBT_MASK(i) & ~irec->ir_free)
- irec->ir_freecount++;
- }
-
irec->ir_free |= xfs_inobt_maskn(0, idx);
+ irec->ir_freecount = hweight64(irec->ir_free);
}
/* Allocate memory for a walk. */
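
The rewritten xfs_iwalk_adjust_start() above stops incrementing ir_freecount one inode at a time; it marks everything before the starting inode free and recomputes the count as a popcount of ir_free. A worked userspace model (the mask values are illustrative; a real chunk tracks 64 inodes):

#include <assert.h>
#include <stdint.h>

static uint64_t maskn(int i, int n)	/* like xfs_inobt_maskn() */
{
	return ((n >= 64 ? 0 : (1ULL << n)) - 1) << i;
}

int main(void)
{
	uint64_t ir_free = 0xff00ULL;	/* inodes 8..15 already free */
	int idx = 4;			/* walk starts at inode 4 */

	ir_free |= maskn(0, idx);	/* 0..3 become "free" too */
	int ir_freecount = __builtin_popcountll(ir_free);

	assert(ir_free == 0xff0fULL && ir_freecount == 12);
	return 0;
}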
@@ -160,7 +153,7 @@ xfs_iwalk_alloc(
/* Allocate a prefetch buffer for inobt records. */
size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
- iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+ iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (iwag->recs == NULL)
return -ENOMEM;
@@ -172,7 +165,7 @@ STATIC void
xfs_iwalk_free(
struct xfs_iwalk_ag *iwag)
{
- kmem_free(iwag->recs);
+ kfree(iwag->recs);
iwag->recs = NULL;
}
@@ -275,9 +268,10 @@ xfs_iwalk_ag_start(
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
if (error)
return error;
+ *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp);
/* Starting at the beginning of the AG? That's easy! */
if (agino == 0)
@@ -306,8 +300,10 @@ xfs_iwalk_ag_start(
error = xfs_inobt_get_rec(*curpp, irec, has_more);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, *has_more != 1))
+ if (XFS_IS_CORRUPT(mp, *has_more != 1)) {
+ xfs_btree_mark_sick(*curpp);
return -EFSCORRUPTED;
+ }
iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
@@ -390,11 +386,10 @@ xfs_iwalk_run_callbacks(
}
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
- agi_bpp);
+ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp);
if (error)
return error;
-
+ *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp);
return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
@@ -434,6 +429,7 @@ xfs_iwalk_ag(
rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
if (iwag->lastino != NULLFSINO &&
XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -627,7 +623,7 @@ xfs_iwalk_ag_work(
xfs_iwalk_free(iwag);
out:
xfs_perag_put(iwag->pag);
- kmem_free(iwag);
+ kfree(iwag);
return error;
}
@@ -663,7 +659,8 @@ xfs_iwalk_threaded(
if (xfs_pwork_ctl_want_abort(&pctl))
break;
- iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+ iwag = kzalloc(sizeof(struct xfs_iwalk_ag),
+ GFP_KERNEL | __GFP_NOFAIL);
iwag->mp = mp;
/*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index d7873e0360f0..8f07c9f6157f 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,15 +21,13 @@ typedef __u32 xfs_nlink_t;
#include "xfs_types.h"
-#include "kmem.h"
-#include "mrlock.h"
-
#include <linux/semaphore.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -51,6 +49,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/notifier.h>
#include <linux/delay.h>
#include <linux/log2.h>
+#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/ctype.h>
@@ -82,6 +81,7 @@ typedef __u32 xfs_nlink_t;
#include "xfs_buf.h"
#include "xfs_message.h"
#include "xfs_drain.h"
+#include "xfs_hooks.h"
#ifdef __BIG_ENDIAN
#define XFS_NATIVE_HOST 1
@@ -269,4 +269,15 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
# define PTR_FMT "%p"
#endif
+/*
+ * Helper for IO routines to grab backing pages from allocated kernel memory.
+ */
+static inline struct page *
+kmem_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
#endif /* __XFS_LINUX__ */
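
kmem_to_page() exists because kvmalloc-style buffers may live in vmalloc space, which is only virtually contiguous, so each backing page must be looked up individually. A sketch of how an I/O path might walk such a buffer (the function and its callback are hypothetical):

static void
example_for_each_page(
	void		*data,
	unsigned int	count,
	void		(*fn)(struct page *page, unsigned int off,
			      unsigned int len))
{
	while (count > 0) {
		unsigned int	off = offset_in_page(data);
		unsigned int	len = min_t(unsigned int,
					    PAGE_SIZE - off, count);

		/* vmalloc and slab memory take different lookups */
		fn(kmem_to_page(data), off, len);
		data += len;
		count -= len;
	}
}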
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a1650fc81382..5004f23d344e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -633,14 +633,14 @@ xlog_state_release_iclog(
*/
int
xfs_log_mount(
- xfs_mount_t *mp,
- xfs_buftarg_t *log_target,
- xfs_daddr_t blk_offset,
- int num_bblks)
+ xfs_mount_t *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
{
- struct xlog *log;
- int error = 0;
- int min_logfsbs;
+ struct xlog *log;
+ int error = 0;
+ int min_logfsbs;
if (!xfs_has_norecovery(mp)) {
xfs_notice(mp, "Mounting V%d Filesystem %pU",
@@ -1528,7 +1528,7 @@ xlog_alloc_log(
int error = -ENOMEM;
uint log2_size = 0;
- log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
+ log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!log) {
xfs_warn(mp, "Log allocation failed: No memory!");
goto out;
@@ -1605,7 +1605,8 @@ xlog_alloc_log(
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
- iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ iclog = kzalloc(sizeof(*iclog) + bvec_size,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog)
goto out_free_iclog;
@@ -1661,13 +1662,13 @@ out_destroy_workqueue:
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- kmem_free(iclog->ic_data);
- kmem_free(iclog);
+ kvfree(iclog->ic_data);
+ kfree(iclog);
if (prev_iclog == log->l_iclog)
break;
}
out_free_log:
- kmem_free(log);
+ kfree(log);
out:
return ERR_PTR(error);
} /* xlog_alloc_log */
@@ -2118,14 +2119,14 @@ xlog_dealloc_log(
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
- kmem_free(iclog->ic_data);
- kmem_free(iclog);
+ kvfree(iclog->ic_data);
+ kfree(iclog);
iclog = next_iclog;
}
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
- kmem_free(log);
+ kfree(log);
}
/*
@@ -3517,7 +3518,8 @@ xlog_ticket_alloc(
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 67a99d94701e..73f5b7f628f4 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void)
{
struct xfs_cil_ctx *ctx;
- ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
INIT_LIST_HEAD(&ctx->log_items);
@@ -339,7 +339,7 @@ xlog_cil_alloc_shadow_bufs(
* the buffer, only the log vector header and the iovec
* storage.
*/
- kmem_free(lip->li_lv_shadow);
+ kvfree(lip->li_lv_shadow);
lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -703,7 +703,7 @@ xlog_cil_free_logvec(
while (!list_empty(lv_chain)) {
lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
list_del_init(&lv->lv_list);
- kmem_free(lv);
+ kvfree(lv);
}
}
@@ -753,7 +753,7 @@ xlog_cil_committed(
return;
}
- kmem_free(ctx);
+ kfree(ctx);
}
void
@@ -1116,11 +1116,18 @@ xlog_cil_cleanup_whiteouts(
* same sequence twice. If we get a race between multiple pushes for the same
* sequence they will block on the first one and then abort, hence avoiding
* needless pushes.
+ *
+ * This runs from a workqueue so it does not inherit any specific memory
+ * allocation context. However, we do not want to block on memory reclaim
+ * recursing back into the filesystem because this push may have been triggered
+ * by memory reclaim itself. Hence we really need to run under full GFP_NOFS
+ * constraints here.
*/
static void
xlog_cil_push_work(
struct work_struct *work)
{
+ unsigned int nofs_flags = memalloc_nofs_save();
struct xfs_cil_ctx *ctx =
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
@@ -1334,12 +1341,14 @@ xlog_cil_push_work(
spin_unlock(&log->l_icloglock);
xlog_cil_cleanup_whiteouts(&whiteouts);
xfs_log_ticket_ungrant(log, ticket);
+ memalloc_nofs_restore(nofs_flags);
return;
out_skip:
up_write(&cil->xc_ctx_lock);
xfs_log_ticket_put(new_ctx->ticket);
- kmem_free(new_ctx);
+ kfree(new_ctx);
+ memalloc_nofs_restore(nofs_flags);
return;
out_abort_free_ticket:
@@ -1348,6 +1357,7 @@ out_abort_free_ticket:
if (!ctx->commit_iclog) {
xfs_log_ticket_ungrant(log, ctx->ticket);
xlog_cil_committed(ctx);
+ memalloc_nofs_restore(nofs_flags);
return;
}
spin_lock(&log->l_icloglock);
@@ -1356,6 +1366,7 @@ out_abort_free_ticket:
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
xfs_log_ticket_ungrant(log, ticket);
+ memalloc_nofs_restore(nofs_flags);
}
/*
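
The push-work hunk above brackets the whole function with memalloc_nofs_save()/memalloc_nofs_restore(), the scoped-allocation idiom that makes every allocation in between behave as GFP_NOFS without tagging each call site. The pattern in isolation (the work body is a hypothetical placeholder):

static void
example_work_fn(
	struct work_struct	*work)
{
	unsigned int	nofs_flags = memalloc_nofs_save();

	/*
	 * Allocations made here, even plain GFP_KERNEL ones, will not
	 * recurse into the filesystem via direct reclaim.
	 */
	do_reclaim_sensitive_work(work);	/* hypothetical */

	memalloc_nofs_restore(nofs_flags);
}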
@@ -1533,7 +1544,7 @@ xlog_cil_process_intents(
set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
trace_xfs_cil_whiteout_mark(ilip);
len += ilip->li_lv->lv_bytes;
- kmem_free(ilip->li_lv);
+ kvfree(ilip->li_lv);
ilip->li_lv = NULL;
xfs_trans_del_item(lip);
@@ -1747,7 +1758,7 @@ xlog_cil_init(
struct xlog_cil_pcp *cilpcp;
int cpu;
- cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
+ cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!cil)
return -ENOMEM;
/*
@@ -1786,7 +1797,7 @@ xlog_cil_init(
out_destroy_wq:
destroy_workqueue(cil->xc_push_wq);
out_destroy_cil:
- kmem_free(cil);
+ kfree(cil);
return -ENOMEM;
}
@@ -1799,12 +1810,12 @@ xlog_cil_destroy(
if (cil->xc_ctx) {
if (cil->xc_ctx->ticket)
xfs_log_ticket_put(cil->xc_ctx->ticket);
- kmem_free(cil->xc_ctx);
+ kfree(cil->xc_ctx);
}
ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
free_percpu(cil->xc_pcp);
destroy_workqueue(cil->xc_push_wq);
- kmem_free(cil);
+ kfree(cil);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1251c81e55f9..13f1d2e91540 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -361,7 +361,7 @@ xlog_find_verify_cycle(
*new_blk = -1;
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -477,7 +477,7 @@ xlog_find_verify_log_record(
*last_blk = i;
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -731,7 +731,7 @@ validate_head:
goto out_free_buffer;
}
- kmem_free(buffer);
+ kvfree(buffer);
if (head_blk == log_bbnum)
*return_head_blk = 0;
else
@@ -745,7 +745,7 @@ validate_head:
return 0;
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
xfs_warn(log->l_mp, "failed to find log head");
return error;
@@ -999,7 +999,7 @@ xlog_verify_tail(
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -1046,7 +1046,7 @@ xlog_verify_head(
error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
XLOG_MAX_ICLOGS, tmp_buffer,
&tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
- kmem_free(tmp_buffer);
+ kvfree(tmp_buffer);
if (error < 0)
return error;
@@ -1365,7 +1365,7 @@ xlog_find_tail(
error = xlog_clear_stale_blocks(log, tail_lsn);
done:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1399,6 +1399,7 @@ xlog_find_zeroed(
xfs_daddr_t new_blk, last_blk, start_blk;
xfs_daddr_t num_scan_bblks;
int error, log_bbnum = log->l_logBBsize;
+ int ret = 1;
*blk_no = 0;
@@ -1413,8 +1414,7 @@ xlog_find_zeroed(
first_cycle = xlog_get_cycle(offset);
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
- kmem_free(buffer);
- return 1;
+ goto out_free_buffer;
}
/* check partially zeroed log */
@@ -1424,8 +1424,8 @@ xlog_find_zeroed(
last_cycle = xlog_get_cycle(offset);
if (last_cycle != 0) { /* log completely written to */
- kmem_free(buffer);
- return 0;
+ ret = 0;
+ goto out_free_buffer;
}
/* we have a partially zeroed log */
@@ -1471,10 +1471,10 @@ xlog_find_zeroed(
*blk_no = last_blk;
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
return error;
- return 1;
+ return ret;
}
/*
@@ -1583,7 +1583,7 @@ xlog_write_log_records(
}
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -2057,7 +2057,8 @@ xlog_recover_add_item(
{
struct xlog_recover_item *item;
- item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
+ item = kzalloc(sizeof(struct xlog_recover_item),
+ GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -2160,7 +2161,7 @@ xlog_recover_add_to_trans(
return 0;
}
- ptr = kmem_alloc(len, 0);
+ ptr = xlog_kvmalloc(len);
memcpy(ptr, dp, len);
in_f = (struct xfs_inode_log_format *)ptr;
@@ -2182,14 +2183,13 @@ xlog_recover_add_to_trans(
"bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
- kmem_free(ptr);
+ kvfree(ptr);
return -EFSCORRUPTED;
}
item->ri_total = in_f->ilf_size;
- item->ri_buf =
- kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
- 0);
+ item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ GFP_KERNEL | __GFP_NOFAIL);
}
if (item->ri_total <= item->ri_cnt) {
@@ -2197,7 +2197,7 @@ xlog_recover_add_to_trans(
"log item region count (%d) overflowed size (%d)",
item->ri_cnt, item->ri_total);
ASSERT(0);
- kmem_free(ptr);
+ kvfree(ptr);
return -EFSCORRUPTED;
}
@@ -2227,13 +2227,13 @@ xlog_recover_free_trans(
/* Free the regions in the item. */
list_del(&item->ri_list);
for (i = 0; i < item->ri_cnt; i++)
- kmem_free(item->ri_buf[i].i_addr);
+ kvfree(item->ri_buf[i].i_addr);
/* Free the item itself */
- kmem_free(item->ri_buf);
- kmem_free(item);
+ kfree(item->ri_buf);
+ kfree(item);
}
/* Free the transaction recover structure */
- kmem_free(trans);
+ kfree(trans);
}
/*
@@ -2332,7 +2332,7 @@ xlog_recover_ophdr_to_trans(
* This is a new transaction so allocate a new recovery container to
* hold the recovery ops that will follow.
*/
- trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
+ trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL);
trans->r_log_tid = tid;
trans->r_lsn = be64_to_cpu(rhead->h_lsn);
INIT_LIST_HEAD(&trans->r_itemq);
@@ -3024,7 +3024,7 @@ xlog_do_recovery_pass(
hblks = xlog_logrec_hblks(log, rhead);
if (hblks != 1) {
- kmem_free(hbp);
+ kvfree(hbp);
hbp = xlog_alloc_buffer(log, hblks);
}
} else {
@@ -3038,7 +3038,7 @@ xlog_do_recovery_pass(
return -ENOMEM;
dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
- kmem_free(hbp);
+ kvfree(hbp);
return -ENOMEM;
}
@@ -3199,16 +3199,33 @@ xlog_do_recovery_pass(
}
bread_err2:
- kmem_free(dbp);
+ kvfree(dbp);
bread_err1:
- kmem_free(hbp);
+ kvfree(hbp);
/*
- * Submit buffers that have been added from the last record processed,
- * regardless of error status.
+ * Submit buffers that have been dirtied by the last record recovered.
*/
- if (!list_empty(&buffer_list))
+ if (!list_empty(&buffer_list)) {
+ if (error) {
+ /*
+ * If there has been an item recovery error then we
+ * cannot allow partial checkpoint writeback to
+ * occur. We might have multiple checkpoints with the
+ * same start LSN in this buffer list, and partial
+ * writeback of a checkpoint in this situation can
+ * prevent future recovery of all the changes in the
+ * checkpoints at this start LSN.
+ *
+ * Note: Shutting down the filesystem will result in the
+ * delwri submission marking all the buffers stale,
+ * completing them and cleaning up _XBF_LOGRECOVERY
+ * state without doing any IO.
+ */
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
error2 = xfs_buf_delwri_submit(&buffer_list);
+ }
if (error && first_bad)
*first_bad = rhead_blk;
@@ -3443,12 +3460,19 @@ xlog_recover(
* part of recovery so that the root and real-time bitmap inodes can be read in
* from disk in between the two stages. This is necessary so that we can free
* space in the real-time portion of the file system.
+ *
+ * We run this whole process under GFP_NOFS allocation context. We do a
+ * combination of non-transactional and transactional work, yet we really don't
+ * want to recurse into the filesystem from direct reclaim during any of this
+ * processing. This allows all the recovery code that runs here not to care
+ * about the memory allocation context it is running in.
*/
int
xlog_recover_finish(
struct xlog *log)
{
- int error;
+ unsigned int nofs_flags = memalloc_nofs_save();
+ int error;
error = xlog_recover_process_intents(log);
if (error) {
@@ -3462,7 +3486,7 @@ xlog_recover_finish(
xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return error;
+ goto out_error;
}
/*
@@ -3483,7 +3507,7 @@ xlog_recover_finish(
if (error < 0) {
xfs_alert(log->l_mp,
"Failed to clear log incompat features on recovery");
- return error;
+ goto out_error;
}
}
@@ -3508,9 +3532,13 @@ xlog_recover_finish(
* and AIL.
*/
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ error = 0;
+ goto out_error;
}
- return 0;
+out_error:
+ memalloc_nofs_restore(nofs_flags);
+ return error;
}
void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 57fa21ad7912..df370eb5dc15 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,7 @@ xfs_uuid_table_free(void)
{
if (xfs_uuid_table_size == 0)
return;
- kmem_free(xfs_uuid_table);
+ kfree(xfs_uuid_table);
xfs_uuid_table = NULL;
xfs_uuid_table_size = 0;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 503fe3c7edbf..e880aa48de68 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -94,9 +94,9 @@ typedef struct xfs_mount {
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
struct xfs_quotainfo *m_quotainfo; /* disk quota information */
- xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
- xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
- xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ struct xfs_buftarg *m_ddev_targp; /* data device */
+ struct xfs_buftarg *m_logdev_targp;/* log device */
+ struct xfs_buftarg *m_rtdev_targp; /* rt device */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
@@ -252,6 +252,9 @@ typedef struct xfs_mount {
/* cpus that have inodes queued for inactivation */
struct cpumask m_inodegc_cpumask;
+
+ /* Hook to feed dirent updates to an active online repair. */
+ struct xfs_hooks m_dir_update_hooks;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
@@ -502,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
-int xfs_buf_hash_init(struct xfs_perag *pag);
-void xfs_buf_hash_destroy(struct xfs_perag *pag);
-
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f85e3b07ab44..7443debaffd6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,13 +333,14 @@ xfs_mru_cache_create(
if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
return -EINVAL;
- if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
+ mru = kzalloc(sizeof(*mru), GFP_KERNEL | __GFP_NOFAIL);
+ if (!mru)
return -ENOMEM;
/* An extra list is needed to avoid reaping up to a grp_time early. */
mru->grp_count = grp_count + 1;
- mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
-
+ mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
+ GFP_KERNEL | __GFP_NOFAIL);
if (!mru->lists) {
err = -ENOMEM;
goto exit;
@@ -364,9 +365,9 @@ xfs_mru_cache_create(
exit:
if (err && mru && mru->lists)
- kmem_free(mru->lists);
+ kfree(mru->lists);
if (err && mru)
- kmem_free(mru);
+ kfree(mru);
return err;
}
@@ -406,8 +407,8 @@ xfs_mru_cache_destroy(
xfs_mru_cache_flush(mru);
- kmem_free(mru->lists);
- kmem_free(mru);
+ kfree(mru->lists);
+ kfree(mru);
}
/*
@@ -427,7 +428,7 @@ xfs_mru_cache_insert(
if (!mru || !mru->lists)
return -EINVAL;
- if (radix_tree_preload(GFP_NOFS))
+ if (radix_tree_preload(GFP_KERNEL))
return -ENOMEM;
INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 67d0a8564ff3..0f4cf4170c35 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_ialloc.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -254,7 +255,7 @@ xfs_qm_dqattach_one(
struct xfs_dquot *dqp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
error = 0;
/*
@@ -322,7 +323,7 @@ xfs_qm_dqattach_locked(
if (!xfs_qm_need_dqattach(ip))
return 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
@@ -353,7 +354,7 @@ done:
* Don't worry about the dquots that we may have attached before any
* error - they'll get detached later if it has not already been done.
*/
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -628,7 +629,8 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_ON(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
+ qinf = mp->m_quotainfo = kzalloc(sizeof(struct xfs_quotainfo),
+ GFP_KERNEL | __GFP_NOFAIL);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -642,9 +644,9 @@ xfs_qm_init_quotainfo(
if (error)
goto out_free_lru;
- INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
- INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
- INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_KERNEL);
+ INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_KERNEL);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_KERNEL);
mutex_init(&qinf->qi_tree_lock);
/* mutex used to serialize quotaoffs */
@@ -691,6 +693,9 @@ xfs_qm_init_quotainfo(
shrinker_register(qinf->qi_shrinker);
+ xfs_hooks_init(&qinf->qi_mod_ino_dqtrx_hooks);
+ xfs_hooks_init(&qinf->qi_apply_dqtrx_hooks);
+
return 0;
out_free_inos:
@@ -700,7 +705,7 @@ out_free_inos:
out_free_lru:
list_lru_destroy(&qinf->qi_lru);
out_free_qinf:
- kmem_free(qinf);
+ kfree(qinf);
mp->m_quotainfo = NULL;
return error;
}
@@ -724,7 +729,7 @@ xfs_qm_destroy_quotainfo(
xfs_qm_destroy_quotainos(qi);
mutex_destroy(&qi->qi_tree_lock);
mutex_destroy(&qi->qi_quotaofflock);
- kmem_free(qi);
+ kfree(qi);
mp->m_quotainfo = NULL;
}
@@ -758,14 +763,18 @@ xfs_qm_qino_alloc(
(mp->m_sb.sb_gquotino != NULLFSINO)) {
ino = mp->m_sb.sb_gquotino;
if (XFS_IS_CORRUPT(mp,
- mp->m_sb.sb_pquotino != NULLFSINO))
+ mp->m_sb.sb_pquotino != NULLFSINO)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
return -EFSCORRUPTED;
+ }
} else if ((flags & XFS_QMOPT_GQUOTA) &&
(mp->m_sb.sb_pquotino != NULLFSINO)) {
ino = mp->m_sb.sb_pquotino;
if (XFS_IS_CORRUPT(mp,
- mp->m_sb.sb_gquotino != NULLFSINO))
+ mp->m_sb.sb_gquotino != NULLFSINO)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
return -EFSCORRUPTED;
+ }
}
if (ino != NULLFSINO) {
error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
@@ -996,7 +1005,8 @@ xfs_qm_reset_dqcounts_buf(
if (qip->i_nblocks == 0)
return 0;
- map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
+ map = kmalloc(XFS_DQITER_MAP_SIZE * sizeof(*map),
+ GFP_KERNEL | __GFP_NOFAIL);
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
@@ -1058,7 +1068,7 @@ xfs_qm_reset_dqcounts_buf(
} while (nmaps > 0);
out:
- kmem_free(map);
+ kfree(map);
return error;
}
@@ -1406,8 +1416,12 @@ error_return:
xfs_warn(mp,
"Quotacheck: Failed to reset quota flags.");
}
- } else
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
+ } else {
xfs_notice(mp, "Quotacheck: Done.");
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_QUOTACHECK);
+ }
+
return error;
error_purge:
@@ -1809,7 +1823,7 @@ xfs_qm_vop_chown(
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
/* old dquot */
@@ -1817,12 +1831,12 @@ xfs_qm_vop_chown(
ASSERT(prevdq);
ASSERT(prevdq != newdq);
- xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks));
- xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks));
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
/* the sparkling new dquot */
- xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks);
- xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
* Back when we made quota reservations for the chown, we reserved the
@@ -1897,29 +1911,28 @@ xfs_qm_vop_create_dqattach(
if (!XFS_IS_QUOTA_ON(mp))
return;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
ip->i_udquot = xfs_qm_dqhold(udqp);
- xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
ip->i_gdquot = xfs_qm_dqhold(gdqp);
- xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
ASSERT(ip->i_projid == pdqp->q_id);
ip->i_pdquot = xfs_qm_dqhold(pdqp);
- xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, 1);
}
/* Decide if this inode's dquot is near an enforcement boundary. */
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index d5c9fc4ba591..f5993012bf98 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -68,6 +68,10 @@ struct xfs_quotainfo {
/* Minimum and maximum quota expiration timestamp values. */
time64_t qi_expiry_min;
time64_t qi_expiry_max;
+
+ /* Hook to feed quota counter updates to an active online repair. */
+ struct xfs_hooks qi_mod_ino_dqtrx_hooks;
+ struct xfs_hooks qi_apply_dqtrx_hooks;
};
static inline struct radix_tree_root *
@@ -104,6 +108,18 @@ xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
return NULL;
}
+/*
+ * Parameters for tracking dqtrx changes on behalf of an inode. The hook
+ * function's arg parameter is the field being updated.
+ */
+struct xfs_mod_ino_dqtrx_params {
+ uintptr_t tx_id;
+ xfs_ino_t ino;
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+ int64_t delta;
+};
+
extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
uint field, int64_t delta);
extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
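A hedged sketch of what a consumer of the structure above might look like, assuming the notifier convention used by xfs_hooks_call() in xfs_trans_mod_ino_dquot() later in this series: the XFS_TRANS_DQ_* field arrives in @action and the params in @data. The function name is hypothetical.

static int
example_mod_ino_dqtrx(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	struct xfs_mod_ino_dqtrx_params	*p = data;

	/* @action carries the XFS_TRANS_DQ_* field being updated */
	pr_debug("tx 0x%lx ino 0x%llx dqtype %u dqid %u field %lu delta %lld\n",
			(unsigned long)p->tx_id,
			(unsigned long long)p->ino,
			p->q_type, p->q_id, action,
			(long long)p->delta);
	return NOTIFY_DONE;
}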
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index b77673dd0558..271c1021c733 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -9,6 +9,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0d56489f3b2..85a4ae1a17f6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -74,6 +74,22 @@ struct xfs_dqtrx {
int64_t qt_icount_delta; /* dquot inode count changes */
};
+enum xfs_apply_dqtrx_type {
+ XFS_APPLY_DQTRX_COMMIT = 0,
+ XFS_APPLY_DQTRX_UNRESERVE,
+};
+
+/*
+ * Parameters for applying dqtrx changes to a dquot. The hook function's
+ * arg parameter is enum xfs_apply_dqtrx_type.
+ */
+struct xfs_apply_dqtrx_params {
+ uintptr_t tx_id;
+ xfs_ino_t ino;
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+};
+
#ifdef CONFIG_XFS_QUOTA
extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
extern void xfs_trans_free_dqinfo(struct xfs_trans *);
@@ -114,6 +130,30 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
}
bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_dquot *dqp, unsigned int field, int64_t delta);
+
+struct xfs_quotainfo;
+
+struct xfs_dqtrx_hook {
+ struct xfs_hook mod_hook;
+ struct xfs_hook apply_hook;
+};
+
+void xfs_dqtrx_hook_disable(void);
+void xfs_dqtrx_hook_enable(void);
+
+int xfs_dqtrx_hook_add(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_del(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_setup(struct xfs_dqtrx_hook *hook, notifier_fn_t mod_fn,
+ notifier_fn_t apply_fn);
+# else
+# define xfs_trans_mod_ino_dquot(tp, ip, dqp, field, delta) \
+ xfs_trans_mod_dquot((tp), (dqp), (field), (delta))
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -173,6 +213,12 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
#define xfs_inode_near_dquot_enforcement(ip, type) (false)
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+# define xfs_dqtrx_hook_enable() ((void)0)
+# define xfs_dqtrx_hook_disable() ((void)0)
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
#endif /* CONFIG_XFS_QUOTA */
static inline int
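A minimal registration sketch using only the hook API declared above; every example_* name is hypothetical, and enable-after-add is one plausible sequence rather than the only correct one.

static int example_dqtrx_mod(struct notifier_block *nb,
		unsigned long action, void *data);
static int example_dqtrx_apply(struct notifier_block *nb,
		unsigned long action, void *data);

static struct xfs_dqtrx_hook example_dqtrx_hook;

static int
example_hook_quota_updates(struct xfs_quotainfo *qi)
{
	int	error;

	xfs_dqtrx_hook_setup(&example_dqtrx_hook, example_dqtrx_mod,
			example_dqtrx_apply);

	/* installs the apply hook before the mod hook internally */
	error = xfs_dqtrx_hook_add(qi, &example_dqtrx_hook);
	if (error)
		return error;

	/* flip the static switch so the fast paths start calling us */
	xfs_dqtrx_hook_enable();
	return 0;
}

static void
example_unhook_quota_updates(struct xfs_quotainfo *qi)
{
	xfs_dqtrx_hook_disable();
	xfs_dqtrx_hook_del(qi, &example_dqtrx_hook);
}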
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 20ad8086da60..14919b33e4fe 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -36,9 +36,9 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
- kmem_free(cuip->cui_item.li_lv_shadow);
+ kvfree(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
- kmem_free(cuip);
+ kfree(cuip);
else
kmem_cache_free(xfs_cui_cache, cuip);
}
@@ -143,8 +143,8 @@ xfs_cui_init(
ASSERT(nextents > 0);
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
- cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
- 0);
+ cuip = kzalloc(xfs_cui_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
else
cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -207,7 +207,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_free(cudp->cud_item.li_lv_shadow);
+ kvfree(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
@@ -425,7 +425,7 @@ xfs_cui_recover_work(
struct xfs_refcount_intent *ri;
ri = kmem_cache_alloc(xfs_refcount_intent_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
ri->ri_startblock = pmap->pe_startblock;
ri->ri_blockcount = pmap->pe_len;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d5ca8bcae65b..7da0e8f961d3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -29,6 +29,7 @@
#include "xfs_iomap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_health.h"
/*
* Copy on Write of Shared Blocks
@@ -527,7 +528,7 @@ xfs_reflink_allocate_cow(
int error;
bool found;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
@@ -805,7 +806,7 @@ xfs_reflink_end_cow_extent(
* If the extent we're remapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
- xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
xfs_refcount_decrease_extent(tp, &data);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-data.br_blockcount);
@@ -829,7 +830,7 @@ xfs_reflink_end_cow_extent(
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
/* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &del);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
/* Charge this new data fork mapping to the on-disk quota. */
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1227,8 +1228,10 @@ xfs_reflink_remap_extent(
* extent if they're both holes or both the same physical extent.
*/
if (dmap->br_startblock == smap.br_startblock) {
- if (dmap->br_state != smap.br_state)
+ if (dmap->br_state != smap.br_state) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
+ }
goto out_cancel;
}
@@ -1291,7 +1294,7 @@ xfs_reflink_remap_extent(
* If the extent we're unmapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
- xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
@@ -1316,7 +1319,7 @@ xfs_reflink_remap_extent(
*/
if (dmap_written) {
xfs_refcount_increase_extent(tp, dmap);
- xfs_bmap_map_extent(tp, ip, dmap);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
qdelta += dmap->br_blockcount;
}
@@ -1391,6 +1394,7 @@ xfs_reflink_remap_blocks(
ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
if (imap.br_startblock == DELAYSTARTBLOCK) {
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ xfs_bmap_mark_sick(src, XFS_DATA_FORK);
error = -EFSCORRUPTED;
break;
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 79ad0087aeca..e473124e29cc 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -36,9 +36,9 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
- kmem_free(ruip->rui_item.li_lv_shadow);
+ kvfree(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
- kmem_free(ruip);
+ kfree(ruip);
else
kmem_cache_free(xfs_rui_cache, ruip);
}
@@ -142,7 +142,8 @@ xfs_rui_init(
ASSERT(nextents > 0);
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+ ruip = kzalloc(xfs_rui_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
else
ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -205,7 +206,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_free(rudp->rud_item.li_lv_shadow);
+ kvfree(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
@@ -454,7 +455,7 @@ xfs_rui_recover_work(
{
struct xfs_rmap_intent *ri;
- ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
case XFS_RMAP_EXTENT_MAP:
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8649d981a097..e66f9bd5de5c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,6 +22,8 @@
#include "xfs_sb.h"
#include "xfs_rtbitmap.h"
#include "xfs_quota.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
/*
* Return whether there are any free extents in the size range given
@@ -903,7 +905,7 @@ xfs_growfs_rt(
/*
* Allocate a new (fake) mount/sb.
*/
- nmp = kmem_alloc(sizeof(*nmp), 0);
+ nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL);
/*
* Loop over the bitmap blocks.
* We will do everything one bitmap block at a time.
@@ -1050,7 +1052,7 @@ out_free:
/*
* Free the fake mp structure.
*/
- kmem_free(nmp);
+ kfree(nmp);
/*
* If we had to allocate a new rsum_cache, we either need to free the
@@ -1059,10 +1061,10 @@ out_free:
*/
if (rsum_cache != mp->m_rsum_cache) {
if (error) {
- kmem_free(mp->m_rsum_cache);
+ kvfree(mp->m_rsum_cache);
mp->m_rsum_cache = rsum_cache;
} else {
- kmem_free(rsum_cache);
+ kvfree(rsum_cache);
}
}
@@ -1202,6 +1204,8 @@ xfs_rtmount_inodes(
sbp = &mp->m_sb;
error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
@@ -1211,6 +1215,8 @@ xfs_rtmount_inodes(
goto out_rele_bitmap;
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);
if (error)
goto out_rele_bitmap;
ASSERT(mp->m_rsumip != NULL);
@@ -1233,7 +1239,7 @@ void
xfs_rtunmount_inodes(
struct xfs_mount *mp)
{
- kmem_free(mp->m_rsum_cache);
+ kvfree(mp->m_rsum_cache);
if (mp->m_rbmip)
xfs_irele(mp->m_rbmip);
if (mp->m_rsumip)
@@ -1260,7 +1266,7 @@ xfs_rtpick_extent(
uint64_t seq; /* sequence number of file creation */
struct timespec64 ts; /* timespec in inode */
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
ts = inode_get_atime(VFS_I(mp->m_rbmip));
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 90a77cd3ebad..ed97d72caa66 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -50,7 +50,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "ibt2", xfsstats_offset(xs_fibt_2) },
{ "fibt2", xfsstats_offset(xs_rmap_2) },
{ "rmapbt", xfsstats_offset(xs_refcbt_2) },
- { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
+ { "refcntbt", xfsstats_offset(xs_rmap_mem_2) },
+ { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) },
+ { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
{ "qm", xfsstats_offset(xs_xstrat_bytes)},
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 43ffba74f045..a61fb56ed2e6 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -125,6 +125,8 @@ struct __xfsstats {
uint32_t xs_fibt_2[__XBTS_MAX];
uint32_t xs_rmap_2[__XBTS_MAX];
uint32_t xs_refcbt_2[__XBTS_MAX];
+ uint32_t xs_rmap_mem_2[__XBTS_MAX];
+ uint32_t xs_rcbag_2[__XBTS_MAX];
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59c8c0541bdd..c21f10ab0f5d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -44,6 +44,7 @@
#include "xfs_dahash_test.h"
#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
+#include "scrub/rcbag_btree.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -715,9 +716,7 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
-
- mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
+ init_rwsem(&ip->i_lock);
}
/*
@@ -760,7 +759,7 @@ xfs_mount_free(
debugfs_remove(mp->m_debugfs);
kfree(mp->m_rtname);
kfree(mp->m_logname);
- kmem_free(mp);
+ kfree(mp);
}
STATIC int
@@ -1986,7 +1985,7 @@ static int xfs_init_fs_context(
{
struct xfs_mount *mp;
- mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
if (!mp)
return -ENOMEM;
@@ -2012,6 +2011,8 @@ static int xfs_init_fs_context(
mp->m_logbsize = -1;
mp->m_allocsize_log = 16; /* 64k */
+ xfs_hooks_init(&mp->m_dir_update_hooks);
+
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
@@ -2058,10 +2059,14 @@ xfs_init_caches(void)
if (error)
goto out_destroy_log_ticket_cache;
- error = xfs_defer_init_item_caches();
+ error = rcbagbt_init_cur_cache();
if (error)
goto out_destroy_btree_cur_cache;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_rcbagbt_cur_cache;
+
xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
@@ -2218,6 +2223,8 @@ xfs_init_caches(void)
kmem_cache_destroy(xfs_da_state_cache);
out_destroy_defer_item_cache:
xfs_defer_destroy_item_caches();
+ out_destroy_rcbagbt_cur_cache:
+ rcbagbt_destroy_cur_cache();
out_destroy_btree_cur_cache:
xfs_btree_destroy_cur_caches();
out_destroy_log_ticket_cache:
@@ -2255,6 +2262,7 @@ xfs_destroy_caches(void)
kmem_cache_destroy(xfs_ifork_cache);
kmem_cache_destroy(xfs_da_state_cache);
xfs_defer_destroy_item_caches();
+ rcbagbt_destroy_cur_cache();
xfs_btree_destroy_cur_caches();
kmem_cache_destroy(xfs_log_ticket_cache);
kmem_cache_destroy(xfs_buf_cache);
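The new rcbagbt cursor cache slots into the usual reverse-order unwind idiom: each successful init gains a matching destroy label, and failures tear down in the opposite order of construction. A generic sketch of the pattern, with hypothetical cache names standing in for the real ones:

static int
example_init_caches(void)
{
	int	error;

	error = example_cache_a_init();
	if (error)
		return error;

	/* newly inserted step, mirroring rcbagbt_init_cur_cache() */
	error = example_cache_b_init();
	if (error)
		goto out_destroy_a;

	error = example_cache_c_init();
	if (error)
		goto out_destroy_b;

	return 0;

out_destroy_b:
	example_cache_b_destroy();
out_destroy_a:
	example_cache_a_destroy();
	return error;
}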
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 92974a4414c8..3e376d24c7c1 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -24,77 +24,7 @@
#include "xfs_ialloc.h"
#include "xfs_error.h"
#include "xfs_health.h"
-
-/* ----- Kernel only functions below ----- */
-int
-xfs_readlink_bmap_ilocked(
- struct xfs_inode *ip,
- char *link)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
- struct xfs_buf *bp;
- xfs_daddr_t d;
- char *cur_chunk;
- int pathlen = ip->i_disk_size;
- int nmaps = XFS_SYMLINK_MAPS;
- int byte_cnt;
- int n;
- int error = 0;
- int fsblocks = 0;
- int offset;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-
- fsblocks = xfs_symlink_blocks(mp, pathlen);
- error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
- if (error)
- goto out;
-
- offset = 0;
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
- error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
- &bp, &xfs_symlink_buf_ops);
- if (error)
- return error;
- byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- if (pathlen < byte_cnt)
- byte_cnt = pathlen;
-
- cur_chunk = bp->b_addr;
- if (xfs_has_crc(mp)) {
- if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
- byte_cnt, bp)) {
- error = -EFSCORRUPTED;
- xfs_alert(mp,
-"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
- offset, byte_cnt, ip->i_ino);
- xfs_buf_relse(bp);
- goto out;
-
- }
-
- cur_chunk += sizeof(struct xfs_dsymlink_hdr);
- }
-
- memcpy(link + offset, cur_chunk, byte_cnt);
-
- pathlen -= byte_cnt;
- offset += byte_cnt;
-
- xfs_buf_relse(bp);
- }
- ASSERT(pathlen == 0);
-
- link[ip->i_disk_size] = '\0';
- error = 0;
-
- out:
- return error;
-}
+#include "xfs_symlink_remote.h"
int
xfs_readlink(
@@ -103,7 +33,7 @@ xfs_readlink(
{
struct xfs_mount *mp = ip->i_mount;
xfs_fsize_t pathlen;
- int error = -EFSCORRUPTED;
+ int error;
trace_xfs_readlink(ip);
@@ -116,14 +46,14 @@ xfs_readlink(
pathlen = ip->i_disk_size;
if (!pathlen)
- goto out;
+ goto out_corrupt;
if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- goto out;
+ goto out_corrupt;
}
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
@@ -132,17 +62,20 @@ xfs_readlink(
* if if_data is junk.
*/
if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data))
- goto out;
+ goto out_corrupt;
memcpy(link, ip->i_df.if_data, pathlen + 1);
error = 0;
} else {
- error = xfs_readlink_bmap_ilocked(ip, link);
+ error = xfs_symlink_remote_read(ip, link);
}
- out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return error;
+ out_corrupt:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ return -EFSCORRUPTED;
}
int
@@ -160,15 +93,7 @@ xfs_symlink(
int error = 0;
int pathlen;
bool unlock_dp_on_error = false;
- xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
- int nmaps;
- struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
- xfs_daddr_t d;
- const char *cur_chunk;
- int byte_cnt;
- int n;
- struct xfs_buf *bp;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -256,62 +181,11 @@ xfs_symlink(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
resblks -= XFS_IALLOC_SPACE_RES(mp);
- /*
- * If the symlink will fit into the inode, write it inline.
- */
- if (pathlen <= xfs_inode_data_fork_size(ip)) {
- xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
-
- ip->i_disk_size = pathlen;
- ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
- } else {
- int offset;
-
- first_fsb = 0;
- nmaps = XFS_SYMLINK_MAPS;
-
- error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, resblks, mval, &nmaps);
- if (error)
- goto out_trans_cancel;
-
- resblks -= fs_blocks;
- ip->i_disk_size = pathlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- cur_chunk = target_path;
- offset = 0;
- for (n = 0; n < nmaps; n++) {
- char *buf;
-
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0, &bp);
- if (error)
- goto out_trans_cancel;
- bp->b_ops = &xfs_symlink_buf_ops;
-
- byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- byte_cnt = min(byte_cnt, pathlen);
-
- buf = bp->b_addr;
- buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
- byte_cnt, bp);
-
- memcpy(buf, cur_chunk, byte_cnt);
-
- cur_chunk += byte_cnt;
- pathlen -= byte_cnt;
- offset += byte_cnt;
-
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
- xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
- (char *)bp->b_addr);
- }
- ASSERT(pathlen == 0);
- }
+ error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
+ fs_blocks, resblks);
+ if (error)
+ goto out_trans_cancel;
+ resblks -= fs_blocks;
i_size_write(VFS_I(ip), ip->i_disk_size);
/*
@@ -322,6 +196,7 @@ xfs_symlink(
goto out_trans_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ xfs_dir_update_hook(dp, ip, 1, link_name);
/*
* If this is a synchronous mount, make sure that the
@@ -496,6 +371,7 @@ xfs_inactive_symlink(
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(0);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index d1ca1ce62a93..0d29a50e66fd 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -10,7 +10,6 @@
int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp,
struct xfs_name *link_name, const char *target_path,
umode_t mode, struct xfs_inode **ipp);
-int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_inactive_symlink(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 17485666b672..d2391eec37fe 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -193,7 +193,6 @@ always_cow_show(
}
XFS_SYSFS_ATTR_RW(always_cow);
-#ifdef DEBUG
/*
* Override how many threads the parallel work queue is allowed to create.
* This has to be a debug-only global (instead of an errortag) because one of
@@ -260,7 +259,6 @@ larp_show(
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
}
XFS_SYSFS_ATTR_RW(larp);
-#endif /* DEBUG */
STATIC ssize_t
bload_leaf_slack_store(
@@ -319,10 +317,8 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
ATTR_LIST(always_cow),
-#ifdef DEBUG
ATTR_LIST(pwork_threads),
ATTR_LIST(larp),
-#endif
ATTR_LIST(bload_leaf_slack),
ATTR_LIST(bload_node_slack),
NULL,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8a5dc1538aa8..1a963382e5e9 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -36,6 +36,9 @@
#include "xfs_error.h"
#include <linux/iomap.h>
#include "xfs_iomap.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bmap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0984a1c884c7..56b07d8ed431 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -79,6 +79,9 @@ union xfs_btree_ptr;
struct xfs_dqtrx;
struct xfs_icwalk;
struct xfs_perag;
+struct xfbtree;
+struct xfs_btree_ops;
+struct xfs_bmap_intent;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -640,6 +643,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
@@ -1710,12 +1714,10 @@ DECLARE_EVENT_CLASS(xfs_agf_class,
__entry->agno = be32_to_cpu(agf->agf_seqno),
__entry->flags = flags;
__entry->length = be32_to_cpu(agf->agf_length),
- __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
- __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
- __entry->bno_level =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
- __entry->cnt_level =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+ __entry->bno_root = be32_to_cpu(agf->agf_bno_root),
+ __entry->cnt_root = be32_to_cpu(agf->agf_cnt_root),
+ __entry->bno_level = be32_to_cpu(agf->agf_bno_level),
+ __entry->cnt_level = be32_to_cpu(agf->agf_cnt_level),
__entry->flfirst = be32_to_cpu(agf->agf_flfirst),
__entry->fllast = be32_to_cpu(agf->agf_fllast),
__entry->flcount = be32_to_cpu(agf->agf_flcount),
@@ -1890,28 +1892,28 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish);
TRACE_EVENT(xfs_alloc_cur_check,
- TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, xfs_extlen_t diff, bool new),
- TP_ARGS(mp, btnum, bno, len, diff, new),
+ TP_ARGS(cur, bno, len, diff, new),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agblock_t, bno)
__field(xfs_extlen_t, len)
__field(xfs_extlen_t, diff)
__field(bool, new)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->btnum = btnum;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __assign_str(name, cur->bc_ops->name);
__entry->bno = bno;
__entry->len = len;
__entry->diff = diff;
__entry->new = new;
),
- TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
+ TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->bno, __entry->len, __entry->diff, __entry->new)
)
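The btnum-to-name conversions in this file all follow the same tracepoint recipe: a dynamic string is declared with __string(), captured with __assign_str(), and printed with __get_str(). A minimal sketch with a hypothetical event name, matching the two-argument __assign_str() form used here:

TRACE_EVENT(example_btree_op,
	TP_PROTO(struct xfs_btree_cur *cur),
	TP_ARGS(cur),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__string(name, cur->bc_ops->name)
	),
	TP_fast_assign(
		__entry->dev = cur->bc_mp->m_super->s_dev;
		__assign_str(name, cur->bc_ops->name);
	),
	TP_printk("dev %d:%d %sbt",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __get_str(name))
);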
@@ -2452,21 +2454,12 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
-/* btree cursor events */
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, nlevels)
__field(int, ptr)
@@ -2474,15 +2467,15 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
- TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
+ TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->nlevels,
__entry->ptr,
@@ -2496,6 +2489,90 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \
DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
+TRACE_EVENT(xfs_btree_alloc_block,
+ TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat,
+ int error),
+ TP_ARGS(cur, ptr, stat, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __string(name, cur->bc_ops->name)
+ __field(int, error)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_INODE:
+ __entry->agno = 0;
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->ino = 0;
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ __entry->agno = 0;
+ __entry->ino = 0;
+ break;
+ }
+ __assign_str(name, cur->bc_ops->name);
+ __entry->error = error;
+ if (!error && stat) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp,
+ fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp,
+ fsb);
+ } else {
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ } else {
+ __entry->agbno = NULLAGBLOCK;
+ }
+ ),
+ TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(name),
+ __entry->agno,
+ __entry->ino,
+ __entry->agbno,
+ __entry->error)
+);
+
+TRACE_EVENT(xfs_btree_free_block,
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp),
+ TP_ARGS(cur, bp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __string(name, cur->bc_ops->name)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = xfs_daddr_to_agno(cur->bc_mp,
+ xfs_buf_daddr(bp));
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ else
+ __entry->ino = 0;
+ __assign_str(name, cur->bc_ops->name);
+ __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp));
+ ),
+ TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(name),
+ __entry->agno,
+ __entry->ino,
+ __entry->agbno)
+);
+
/* deferred ops */
struct xfs_defer_pending;
@@ -2579,7 +2656,25 @@ DEFINE_EVENT(xfs_defer_pending_class, name, \
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
TP_ARGS(mp, dfp))
-DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
+
+DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
int type, xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, type, agbno, len),
@@ -2604,92 +2699,17 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
__entry->agbno,
__entry->len)
);
-#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
int type, \
xfs_agblock_t bno, \
xfs_extlen_t len), \
TP_ARGS(mp, agno, type, bno, len))
-
-DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int op,
- xfs_agblock_t agbno,
- xfs_ino_t ino,
- int whichfork,
- xfs_fileoff_t offset,
- xfs_filblks_t len,
- xfs_exntst_t state),
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_ino_t, ino)
- __field(xfs_agblock_t, agbno)
- __field(int, whichfork)
- __field(xfs_fileoff_t, l_loff)
- __field(xfs_filblks_t, l_len)
- __field(xfs_exntst_t, l_state)
- __field(int, op)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->ino = ino;
- __entry->agbno = agbno;
- __entry->whichfork = whichfork;
- __entry->l_loff = offset;
- __entry->l_len = len;
- __entry->l_state = state;
- __entry->op = op;
- ),
- TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->op,
- __entry->agno,
- __entry->agbno,
- __entry->ino,
- __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
- __entry->l_loff,
- __entry->l_len,
- __entry->l_state)
-);
-#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- int op, \
- xfs_agblock_t agbno, \
- xfs_ino_t ino, \
- int whichfork, \
- xfs_fileoff_t offset, \
- xfs_filblks_t len, \
- xfs_exntst_t state), \
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
-
-DEFINE_DEFER_EVENT(xfs_defer_cancel);
-DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
-DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
-DEFINE_DEFER_EVENT(xfs_defer_finish);
-DEFINE_DEFER_EVENT(xfs_defer_finish_done);
-
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
-
-#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
@@ -2854,12 +2874,63 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int op,
+ xfs_agblock_t agbno,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset,
+ xfs_filblks_t len,
+ xfs_exntst_t state),
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->ino = ino;
+ __entry->agbno = agbno;
+ __entry->whichfork = whichfork;
+ __entry->l_loff = offset;
+ __entry->l_len = len;
+ __entry->l_state = state;
+ __entry->op = op;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->op,
+ __entry->agno,
+ __entry->agbno,
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_RMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int op, \
+ xfs_agblock_t agbno, \
+ xfs_ino_t ino, \
+ int whichfork, \
+ xfs_fileoff_t offset, \
+ xfs_filblks_t len, \
+ xfs_exntst_t state), \
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
-DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
DEFINE_RMAPBT_EVENT(xfs_rmap_update);
DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
@@ -2876,7 +2947,66 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
/* deferred bmbt updates */
-#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT
+TRACE_DEFINE_ENUM(XFS_BMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP);
+
+DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
+ TP_PROTO(struct xfs_bmap_intent *bi),
+ TP_ARGS(bi),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, opdev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_fsblock_t, rtbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ struct xfs_inode *ip = bi->bi_owner;
+
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) {
+ __entry->agno = 0;
+ __entry->agbno = 0;
+ __entry->rtbno = bi->bi_bmap.br_startblock;
+ __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev;
+ } else {
+ __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount,
+ bi->bi_bmap.br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount,
+ bi->bi_bmap.br_startblock);
+ __entry->rtbno = 0;
+ __entry->opdev = __entry->dev;
+ }
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = bi->bi_whichfork;
+ __entry->l_loff = bi->bi_bmap.br_startoff;
+ __entry->l_len = bi->bi_bmap.br_blockcount;
+ __entry->l_state = bi->bi_bmap.br_state;
+ __entry->op = bi->bi_type;
+ ),
+ TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS),
+ MAJOR(__entry->opdev), MINOR(__entry->opdev),
+ __entry->ino,
+ __entry->agno,
+ __entry->agbno,
+ __entry->rtbno,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_BMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_deferred_class, name, \
+ TP_PROTO(struct xfs_bmap_intent *bi), \
+ TP_ARGS(bi))
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
@@ -3217,8 +3347,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
TP_ARGS(mp, agno, i1, i2, i3))
/* refcount btree tracepoints */
-DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
@@ -3255,7 +3383,39 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
-#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+
+DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int type, \
+ xfs_agblock_t bno, \
+ xfs_extlen_t len), \
+ TP_ARGS(mp, agno, type, bno, len))
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
@@ -3926,9 +4086,11 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name, \
TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
TP_ARGS(mp, flags))
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
@@ -3955,6 +4117,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name, \
unsigned int flags), \
TP_ARGS(mp, agno, flags))
DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt);
DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
@@ -3980,7 +4143,9 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
TP_ARGS(ip, flags))
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt);
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption);
TRACE_EVENT(xfs_iwalk_ag,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -4040,31 +4205,6 @@ TRACE_EVENT(xfs_pwork_init,
__entry->nr_threads, __entry->pid)
)
-DECLARE_EVENT_CLASS(xfs_kmem_class,
- TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
- TP_ARGS(size, flags, caller_ip),
- TP_STRUCT__entry(
- __field(ssize_t, size)
- __field(int, flags)
- __field(unsigned long, caller_ip)
- ),
- TP_fast_assign(
- __entry->size = size;
- __entry->flags = flags;
- __entry->caller_ip = caller_ip;
- ),
- TP_printk("size %zd flags 0x%x caller %pS",
- __entry->size,
- __entry->flags,
- (char *)__entry->caller_ip)
-)
-
-#define DEFINE_KMEM_EVENT(name) \
-DEFINE_EVENT(xfs_kmem_class, name, \
- TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
- TP_ARGS(size, flags, caller_ip))
-DEFINE_KMEM_EVENT(kmem_alloc);
-
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
TP_ARGS(mp, new_dalign, calc_rootino),
@@ -4091,7 +4231,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
TP_ARGS(cur),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(unsigned int, levels)
@@ -4099,15 +4239,15 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
__entry->blocks = cur->bc_ag.afake->af_blocks;
),
- TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
+ TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->levels,
__entry->blocks,
@@ -4119,7 +4259,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
TP_ARGS(cur),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agino_t, agino)
__field(unsigned int, levels)
@@ -4128,7 +4268,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
cur->bc_ino.ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4137,9 +4277,9 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
__entry->blocks = cur->bc_ino.ifake->if_blocks;
__entry->whichfork = cur->bc_ino.whichfork;
),
- TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
+ TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->agino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
@@ -4156,7 +4296,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
blocks_with_extra),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(unsigned int, level)
__field(unsigned int, nlevels)
__field(uint64_t, nr_this_level)
@@ -4167,7 +4307,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->nr_this_level = nr_this_level;
@@ -4176,9 +4316,9 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
__entry->blocks = blocks;
__entry->blocks_with_extra = blocks_with_extra;
),
- TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->nlevels,
__entry->nr_this_level,
@@ -4195,7 +4335,7 @@ TRACE_EVENT(xfs_btree_bload_block,
TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(unsigned int, level)
__field(unsigned long long, block_idx)
__field(unsigned long long, nr_blocks)
@@ -4205,11 +4345,11 @@ TRACE_EVENT(xfs_btree_bload_block,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->block_idx = block_idx;
__entry->nr_blocks = nr_blocks;
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
@@ -4220,9 +4360,9 @@ TRACE_EVENT(xfs_btree_bload_block,
}
__entry->nr_records = nr_records;
),
- TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
+ TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->block_idx,
__entry->nr_blocks,
@@ -4472,6 +4612,159 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
#endif /* CONFIG_XFS_DRAIN_INTENTS */
+#ifdef CONFIG_XFS_MEMORY_BUFS
+TRACE_EVENT(xmbuf_create,
+ TP_PROTO(struct xfs_buftarg *btp),
+ TP_ARGS(btp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+ struct file *file = btp->bt_file;
+
+ __entry->ino = file_inode(file)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(file, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("xmino 0x%lx path '%s'",
+ __entry->ino,
+ __entry->pathname)
+);
+
+TRACE_EVENT(xmbuf_free,
+ TP_PROTO(struct xfs_buftarg *btp),
+ TP_ARGS(btp),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes)
+ __field(loff_t, size)
+ ),
+ TP_fast_assign(
+ struct file *file = btp->bt_file;
+ struct inode *inode = file_inode(file);
+
+ __entry->size = i_size_read(inode);
+ __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
+ __entry->ino = inode->i_ino;
+ ),
+ TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes,
+ __entry->size)
+);
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+TRACE_EVENT(xfbtree_init,
+ TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt,
+ const struct xfs_btree_ops *ops),
+ TP_ARGS(mp, xfbt, ops),
+ TP_STRUCT__entry(
+ __field(const void *, btree_ops)
+ __field(unsigned long, xfino)
+ __field(unsigned int, leaf_mxr)
+ __field(unsigned int, leaf_mnr)
+ __field(unsigned int, node_mxr)
+ __field(unsigned int, node_mnr)
+ __field(unsigned long long, owner)
+ ),
+ TP_fast_assign(
+ __entry->btree_ops = ops;
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __entry->leaf_mxr = xfbt->maxrecs[0];
+ __entry->node_mxr = xfbt->maxrecs[1];
+ __entry->leaf_mnr = xfbt->minrecs[0];
+ __entry->node_mnr = xfbt->minrecs[1];
+ __entry->owner = xfbt->owner;
+ ),
+ TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u",
+ __entry->xfino,
+ __entry->btree_ops,
+ __entry->owner,
+ __entry->leaf_mxr,
+ __entry->leaf_mnr,
+ __entry->node_mxr,
+ __entry->node_mnr)
+);
+
+DECLARE_EVENT_CLASS(xfbtree_buf_class,
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp),
+ TP_ARGS(xfbt, bp),
+ TP_STRUCT__entry(
+ __field(unsigned long, xfino)
+ __field(xfs_daddr_t, bno)
+ __field(int, nblks)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned int, lockval)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->nblks = bp->b_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = bp->b_sema.count;
+ __entry->flags = bp->b_flags;
+ ),
+ TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s",
+ __entry->xfino,
+ (unsigned long long)__entry->bno,
+ __entry->nblks,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS))
+)
+
+#define DEFINE_XFBTREE_BUF_EVENT(name) \
+DEFINE_EVENT(xfbtree_buf_class, name, \
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \
+ TP_ARGS(xfbt, bp))
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf);
+
+DECLARE_EVENT_CLASS(xfbtree_freesp_class,
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur,
+ xfs_fileoff_t fileoff),
+ TP_ARGS(xfbt, cur, fileoff),
+ TP_STRUCT__entry(
+ __field(unsigned long, xfino)
+ __string(btname, cur->bc_ops->name)
+ __field(int, nlevels)
+ __field(xfs_fileoff_t, fileoff)
+ ),
+ TP_fast_assign(
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __assign_str(btname, cur->bc_ops->name);
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->fileoff = fileoff;
+ ),
+ TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx",
+ __entry->xfino,
+ __get_str(btname),
+ __entry->nlevels,
+ (unsigned long long)__entry->fileoff)
+)
+
+#define DEFINE_XFBTREE_FREESP_EVENT(name) \
+DEFINE_EVENT(xfbtree_freesp_class, name, \
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \
+ xfs_fileoff_t fileoff), \
+ TP_ARGS(xfbt, cur, fileoff))
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 12d45e93f07d..7350640059cc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1273,7 +1273,7 @@ xfs_trans_reserve_more_inode(
unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
if (error)
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 08ce757c7454..3f7e3a09a49f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -215,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098452e7f95..e4c343096f95 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -901,7 +901,8 @@ xfs_trans_ail_init(
{
struct xfs_ail *ailp;
- ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+ ailp = kzalloc(sizeof(struct xfs_ail),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!ailp)
return -ENOMEM;
@@ -921,7 +922,7 @@ xfs_trans_ail_init(
return 0;
out_free_ailp:
- kmem_free(ailp);
+ kfree(ailp);
return -ENOMEM;
}
@@ -932,5 +933,5 @@ xfs_trans_ail_destroy(
struct xfs_ail *ailp = mp->m_ail;
kthread_stop(ailp->ail_task);
- kmem_free(ailp);
+ kfree(ailp);
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6549e50d852c..e28ab74af4f0 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -393,6 +393,48 @@ xfs_trans_brelse(
}
/*
+ * Forcibly detach a buffer previously joined to the transaction. The caller
+ * will retain its locked reference to the buffer after this function returns.
+ * The buffer must be completely clean and must not be held to the transaction.
+ */
+void
+xfs_trans_bdetach(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(tp != NULL);
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_trans_bdetach(bip);
+
+ /*
+ * Erase the recursion count, since we're removing this buffer from the
+ * transaction.
+ */
+ bip->bli_recur = 0;
+
+ /*
+ * The buffer must be completely clean. Specifically, it had better
+ * not be dirty, stale, logged, ordered, or held to the transaction.
+ */
+ ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
+ ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY));
+ ASSERT(!(bip->bli_flags & XFS_BLI_HOLD));
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+
+ /* Unlink the log item from the transaction and drop the log item. */
+ xfs_trans_del_item(&bip->bli_item);
+ xfs_buf_item_put(bip);
+ bp->b_transp = NULL;
+}
+
+/*
* Mark the buffer as not needing to be unlocked when the buf item's
* iop_committing() routine is called. The buffer must already be locked
* and associated with the given transaction.
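A usage sketch for the new helper under its stated constraints: the buffer was joined to the transaction, is completely clean, and the caller keeps the locked reference after detaching. The function name is hypothetical.

static void
example_detach_and_release(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp)
{
	/* tp stops tracking bp; bp stays locked and referenced */
	xfs_trans_bdetach(tp, bp);

	/* the caller now owns the buffer and releases it as usual */
	xfs_buf_relse(bp);
}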
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index aa00cf67ad72..577b535a595c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_health.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -120,6 +121,116 @@ xfs_trans_dup_dqinfo(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of quota live updates. If the
+ * compiler supports jump labels, the static branch will be replaced by a nop
+ * sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dqtrx_hooks_switch);
+
+void
+xfs_dqtrx_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dqtrx_hooks_switch);
+}
+
+void
+xfs_dqtrx_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dqtrx_hooks_switch);
+}
+
+/* Schedule a transactional dquot update on behalf of an inode. */
+void
+xfs_trans_mod_ino_dquot(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *dqp,
+ unsigned int field,
+ int64_t delta)
+{
+ xfs_trans_mod_dquot(tp, dqp, field, delta);
+
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_mod_ino_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .ino = ip->i_ino,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ .delta = delta
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_mod_ino_dqtrx_hooks, field, &p);
+ }
+}
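
A hedged sketch of the receiving side: mod hooks are plain notifier callbacks, so they get the quota field as the action argument and the parameter block built above as the data pointer. The name example_mod_fn() and the debug print are hypothetical.

/* Hypothetical mod-hook callback; not part of the patch. */
static int
example_mod_fn(
	struct notifier_block	*nb,
	unsigned long		field,
	void			*data)
{
	struct xfs_mod_ino_dqtrx_params	*p = data;

	pr_debug("tx 0x%lx ino 0x%llx qtype %u qid %u field %lu delta %lld\n",
			(unsigned long)p->tx_id,
			(unsigned long long)p->ino,
			p->q_type, p->q_id, field,
			(long long)p->delta);
	return NOTIFY_DONE;
}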
+
+/* Call the specified functions during a dquot counter update. */
+int
+xfs_dqtrx_hook_add(
+ struct xfs_quotainfo *qi,
+ struct xfs_dqtrx_hook *hook)
+{
+ int error;
+
+ /*
+ * Transactional dquot updates first call the mod hook when changes
+ * are attached to the transaction and then call the apply hook when
+ * those changes are committed (or canceled).
+ *
+ * The apply hook must be installed before the mod hook so that we
+ * never fail to catch the end of a quota update sequence.
+ */
+ error = xfs_hooks_add(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+ if (error)
+ goto out;
+
+ error = xfs_hooks_add(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+ if (error)
+ goto out_apply;
+
+ return 0;
+
+out_apply:
+ xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+out:
+ return error;
+}
+
+/* Stop calling the specified function during a dquot counter update. */
+void
+xfs_dqtrx_hook_del(
+ struct xfs_quotainfo *qi,
+ struct xfs_dqtrx_hook *hook)
+{
+ /*
+ * The mod hook must be removed before the apply hook to avoid giving
+ * the hook consumer an incomplete update. No hooks should be running
+ * after these functions return.
+ */
+ xfs_hooks_del(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+ xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+}
+
+/* Configure dquot update hook functions. */
+void
+xfs_dqtrx_hook_setup(
+ struct xfs_dqtrx_hook *hook,
+ notifier_fn_t mod_fn,
+ notifier_fn_t apply_fn)
+{
+ xfs_hook_setup(&hook->mod_hook, mod_fn);
+ xfs_hook_setup(&hook->apply_hook, apply_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
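Putting the pieces together, a hedged sketch of a consumer's lifecycle; example_mod_fn() is sketched above and example_apply_fn() further below, and everything else here is hypothetical. The add/del helpers enforce the ordering invariants (apply installed first, mod removed first) so the consumer never sees a mod event whose matching apply event it could miss.

/* Hypothetical consumer lifecycle; not part of the patch. */
static struct xfs_dqtrx_hook	example_hook;

int
example_start_watching(
	struct xfs_quotainfo	*qi)
{
	int			error;

	xfs_dqtrx_hook_setup(&example_hook, example_mod_fn,
			example_apply_fn);

	/* Installs the apply hook before the mod hook. */
	error = xfs_dqtrx_hook_add(qi, &example_hook);
	if (error)
		return error;

	/* Flip the static branch so the hot paths start calling us. */
	xfs_dqtrx_hook_enable();
	return 0;
}

void
example_stop_watching(
	struct xfs_quotainfo	*qi)
{
	xfs_dqtrx_hook_disable();

	/* Removes the mod hook before the apply hook. */
	xfs_dqtrx_hook_del(qi, &example_hook);
}
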
/*
* Wrap around mod_dquot to account for user, group, and project quotas.
*/
@@ -137,11 +248,11 @@ xfs_trans_mod_dquot_byino(
return;
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_gdquot, field, delta);
if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_pdquot, field, delta);
}
STATIC struct xfs_dqtrx *
@@ -321,6 +432,29 @@ xfs_apply_quota_reservation_deltas(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to apply dquot deltas. */
+static inline void
+xfs_trans_apply_dquot_deltas_hook(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_apply_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+ XFS_APPLY_DQTRX_COMMIT, &p);
+ }
+}
+#else
+# define xfs_trans_apply_dquot_deltas_hook(tp, dqp) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
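The tx_id field is what lets a consumer tie the stream of per-dquot mod events to the single commit-or-abort apply event for the same transaction. A hedged sketch of the apply side, with hypothetical example_* names and purely illustrative bookkeeping comments; the XFS_APPLY_DQTRX_* action codes are the ones used by the patch.

/* Hypothetical apply-hook callback; not part of the patch. */
static int
example_apply_fn(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	struct xfs_apply_dqtrx_params	*p = data;

	switch (action) {
	case XFS_APPLY_DQTRX_COMMIT:
		/*
		 * Transaction committing: fold the deltas recorded under
		 * p->tx_id into shadow counters for (p->q_type, p->q_id).
		 */
		break;
	case XFS_APPLY_DQTRX_UNRESERVE:
		/* Transaction aborting: discard deltas for p->tx_id. */
		break;
	}
	return NOTIFY_DONE;
}
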
/*
* Called by xfs_trans_commit() and similar in spirit to
* xfs_trans_apply_sb_deltas().
@@ -366,6 +500,8 @@ xfs_trans_apply_dquot_deltas(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ xfs_trans_apply_dquot_deltas_hook(tp, dqp);
+
/*
* adjust the actual number of blocks used
*/
@@ -465,6 +601,29 @@ xfs_trans_apply_dquot_deltas(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to cancel dquot deltas. */
+static inline void
+xfs_trans_unreserve_and_mod_dquots_hook(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_apply_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+ XFS_APPLY_DQTRX_UNRESERVE, &p);
+ }
+}
+#else
+# define xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Release the reservations, and adjust the dquots accordingly.
* This is called only when the transaction is being aborted. If by
@@ -495,6 +654,9 @@ xfs_trans_unreserve_and_mod_dquots(
*/
if ((dqp = qtrx->qt_dquot) == NULL)
break;
+
+ xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp);
+
/*
* Unreserve the original reservation. We don't care
* about the number of blocks used field, or deltas.
@@ -706,6 +868,7 @@ error_return:
error_corrupt:
xfs_dqunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
return -EFSCORRUPTED;
}
@@ -796,7 +959,7 @@ xfs_trans_reserve_quota_nblks(
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (force)
qflags |= XFS_QMOPT_FORCE_RES;