summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Makefile1
-rw-r--r--fs/9p/vfs_addr.c10
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/adfs.h1
-rw-r--r--fs/adfs/file.c1
-rw-r--r--fs/affs/affs.h1
-rw-r--r--fs/affs/amigaffs.c1
-rw-r--r--fs/affs/amigaffs.h1
-rw-r--r--fs/affs/bitmap.c1
-rw-r--r--fs/affs/dir.c1
-rw-r--r--fs/affs/file.c1
-rw-r--r--fs/affs/inode.c1
-rw-r--r--fs/affs/namei.c1
-rw-r--r--fs/affs/symlink.c1
-rw-r--r--fs/afs/Makefile1
-rw-r--r--fs/afs/netdevices.c1
-rw-r--r--fs/aio.c2
-rw-r--r--fs/attr.c1
-rw-r--r--fs/bad_inode.c1
-rw-r--r--fs/befs/befs.h1
-rw-r--r--fs/befs/befs_fs_types.h1
-rw-r--r--fs/befs/btree.h1
-rw-r--r--fs/befs/datastream.c1
-rw-r--r--fs/befs/datastream.h1
-rw-r--r--fs/befs/debug.c1
-rw-r--r--fs/befs/endian.h1
-rw-r--r--fs/befs/inode.c1
-rw-r--r--fs/befs/io.c1
-rw-r--r--fs/bfs/bfs.h1
-rw-r--r--fs/bfs/dir.c1
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_flat.c1
-rw-r--r--fs/binfmt_misc.c56
-rw-r--r--fs/binfmt_script.c17
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/btrfs/Kconfig11
-rw-r--r--fs/btrfs/Makefile4
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/backref.c72
-rw-r--r--fs/btrfs/backref.h8
-rw-r--r--fs/btrfs/btrfs_inode.h29
-rw-r--r--fs/btrfs/check-integrity.c8
-rw-r--r--fs/btrfs/compression.c511
-rw-r--r--fs/btrfs/compression.h6
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/ctree.h33
-rw-r--r--fs/btrfs/delayed-inode.c46
-rw-r--r--fs/btrfs/delayed-ref.c296
-rw-r--r--fs/btrfs/delayed-ref.h54
-rw-r--r--fs/btrfs/disk-io.c236
-rw-r--r--fs/btrfs/export.c1
-rw-r--r--fs/btrfs/export.h1
-rw-r--r--fs/btrfs/extent-tree.c829
-rw-r--r--fs/btrfs/extent_io.c53
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file.c50
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode-map.h1
-rw-r--r--fs/btrfs/inode.c354
-rw-r--r--fs/btrfs/ioctl.c168
-rw-r--r--fs/btrfs/lzo.c5
-rw-r--r--fs/btrfs/ordered-data.c21
-rw-r--r--fs/btrfs/qgroup.c14
-rw-r--r--fs/btrfs/raid56.c30
-rw-r--r--fs/btrfs/ref-verify.c1031
-rw-r--r--fs/btrfs/ref-verify.h62
-rw-r--r--fs/btrfs/relocation.c19
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c22
-rw-r--r--fs/btrfs/send.c76
-rw-r--r--fs/btrfs/send.h2
-rw-r--r--fs/btrfs/super.c39
-rw-r--r--fs/btrfs/sysfs.c63
-rw-r--r--fs/btrfs/sysfs.h27
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c20
-rw-r--r--fs/btrfs/tests/qgroup-tests.c30
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/tree-checker.c425
-rw-r--r--fs/btrfs/tree-checker.h26
-rw-r--r--fs/btrfs/tree-log.c46
-rw-r--r--fs/btrfs/volumes.c170
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/btrfs/zlib.c15
-rw-r--r--fs/btrfs/zstd.c5
-rw-r--r--fs/buffer.c7
-rw-r--r--fs/cachefiles/Makefile1
-rw-r--r--fs/ceph/Makefile1
-rw-r--r--fs/ceph/addr.c1
-rw-r--r--fs/ceph/caps.c6
-rw-r--r--fs/ceph/ceph_frag.c1
-rw-r--r--fs/ceph/debugfs.c1
-rw-r--r--fs/ceph/dir.c1
-rw-r--r--fs/ceph/export.c1
-rw-r--r--fs/ceph/file.c1
-rw-r--r--fs/ceph/inode.c1
-rw-r--r--fs/ceph/ioctl.c1
-rw-r--r--fs/ceph/ioctl.h1
-rw-r--r--fs/ceph/locks.c1
-rw-r--r--fs/ceph/mds_client.c12
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/mdsmap.c1
-rw-r--r--fs/ceph/snap.c9
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/ceph/xattr.c1
-rw-r--r--fs/char_dev.c1
-rw-r--r--fs/cifs/Kconfig5
-rw-r--r--fs/cifs/Makefile1
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2ops.c61
-rw-r--r--fs/cifs/smb2pdu.c33
-rw-r--r--fs/cifs/smb2pdu.h5
-rw-r--r--fs/cifs/smb2proto.h1
-rw-r--r--fs/cifs/smb2transport.c26
-rw-r--r--fs/coda/cache.c1
-rw-r--r--fs/coda/cnode.c1
-rw-r--r--fs/coda/coda_cache.h1
-rw-r--r--fs/coda/coda_fs_i.h1
-rw-r--r--fs/coda/coda_int.h1
-rw-r--r--fs/coda/coda_linux.c1
-rw-r--r--fs/coda/coda_linux.h1
-rw-r--r--fs/coda/dir.c1
-rw-r--r--fs/coda/file.c1
-rw-r--r--fs/coda/inode.c1
-rw-r--r--fs/coda/pioctl.c1
-rw-r--r--fs/coda/symlink.c1
-rw-r--r--fs/coda/sysctl.c1
-rw-r--r--fs/coda/upcall.c1
-rw-r--r--fs/compat_ioctl.c1
-rw-r--r--fs/coredump.c1
-rw-r--r--fs/cramfs/uncompress.c1
-rw-r--r--fs/crypto/Makefile2
-rw-r--r--fs/crypto/bio.c1
-rw-r--r--fs/crypto/crypto.c37
-rw-r--r--fs/crypto/fname.c40
-rw-r--r--fs/crypto/fscrypt_private.h14
-rw-r--r--fs/crypto/hooks.c112
-rw-r--r--fs/crypto/keyinfo.c31
-rw-r--r--fs/crypto/policy.c7
-rw-r--r--fs/dax.c2
-rw-r--r--fs/dcache.c22
-rw-r--r--fs/direct-io.c77
-rw-r--r--fs/dlm/Makefile1
-rw-r--r--fs/drop_caches.c1
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h24
-rw-r--r--fs/ecryptfs/keystore.c9
-rw-r--r--fs/efs/dir.c1
-rw-r--r--fs/efs/efs.h1
-rw-r--r--fs/efs/file.c1
-rw-r--r--fs/efs/namei.c1
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/efs/symlink.c1
-rw-r--r--fs/exec.c5
-rw-r--r--fs/ext2/Makefile1
-rw-r--r--fs/ext2/acl.c1
-rw-r--r--fs/ext2/acl.h1
-rw-r--r--fs/ext2/balloc.c1
-rw-r--r--fs/ext2/dir.c1
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c1
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/ioctl.c1
-rw-r--r--fs/ext2/namei.c1
-rw-r--r--fs/ext2/symlink.c1
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext2/xattr.h1
-rw-r--r--fs/ext2/xattr_security.c1
-rw-r--r--fs/ext2/xattr_trusted.c1
-rw-r--r--fs/ext2/xattr_user.c1
-rw-r--r--fs/ext4/Kconfig1
-rw-r--r--fs/ext4/Makefile1
-rw-r--r--fs/ext4/acl.c1
-rw-r--r--fs/ext4/acl.h1
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/bitmap.c1
-rw-r--r--fs/ext4/block_validity.c1
-rw-r--r--fs/ext4/dir.c1
-rw-r--r--fs/ext4/ext4.h59
-rw-r--r--fs/ext4/ext4_jbd2.c1
-rw-r--r--fs/ext4/extents.c6
-rw-r--r--fs/ext4/extents_status.c1
-rw-r--r--fs/ext4/extents_status.h1
-rw-r--r--fs/ext4/file.c287
-rw-r--r--fs/ext4/fsync.c1
-rw-r--r--fs/ext4/ialloc.c5
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inline.c43
-rw-r--r--fs/ext4/inode.c171
-rw-r--r--fs/ext4/ioctl.c31
-rw-r--r--fs/ext4/mballoc.c28
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/mmp.c1
-rw-r--r--fs/ext4/namei.c63
-rw-r--r--fs/ext4/page-io.c1
-rw-r--r--fs/ext4/readpage.c1
-rw-r--r--fs/ext4/resize.c105
-rw-r--r--fs/ext4/super.c46
-rw-r--r--fs/ext4/symlink.c1
-rw-r--r--fs/ext4/sysfs.c1
-rw-r--r--fs/ext4/truncate.h1
-rw-r--r--fs/ext4/xattr.c1
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/ext4/xattr_security.c1
-rw-r--r--fs/ext4/xattr_trusted.c1
-rw-r--r--fs/ext4/xattr_user.c1
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/f2fs.h11
-rw-r--r--fs/f2fs/inode.c5
-rw-r--r--fs/f2fs/segment.c6
-rw-r--r--fs/f2fs/super.c9
-rw-r--r--fs/fat/Makefile1
-rw-r--r--fs/fat/cache.c1
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fhandle.c1
-rw-r--r--fs/file.c1
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/filesystems.c1
-rw-r--r--fs/fs_pin.c5
-rw-r--r--fs/fscache/Makefile1
-rw-r--r--fs/fscache/object-list.c7
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/Makefile1
-rw-r--r--fs/gfs2/acl.c1
-rw-r--r--fs/gfs2/bmap.c322
-rw-r--r--fs/gfs2/bmap.h4
-rw-r--r--fs/gfs2/file.c124
-rw-r--r--fs/gfs2/glock.c14
-rw-r--r--fs/gfs2/inode.c89
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/super.c5
-rw-r--r--fs/gfs2/trace_gfs2.h66
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/gfs2/xattr.c63
-rw-r--r--fs/hfs/attr.c1
-rw-r--r--fs/hfs/bfind.c1
-rw-r--r--fs/hfs/bnode.c1
-rw-r--r--fs/hfs/brec.c1
-rw-r--r--fs/hfs/btree.c1
-rw-r--r--fs/hfs/btree.h1
-rw-r--r--fs/hfsplus/Makefile1
-rw-r--r--fs/hfsplus/acl.h1
-rw-r--r--fs/hfsplus/attributes.c1
-rw-r--r--fs/hfsplus/bfind.c1
-rw-r--r--fs/hfsplus/bitmap.c1
-rw-r--r--fs/hfsplus/bnode.c1
-rw-r--r--fs/hfsplus/brec.c1
-rw-r--r--fs/hfsplus/btree.c1
-rw-r--r--fs/hfsplus/catalog.c1
-rw-r--r--fs/hfsplus/dir.c1
-rw-r--r--fs/hfsplus/extents.c1
-rw-r--r--fs/hfsplus/hfsplus_fs.h1
-rw-r--r--fs/hfsplus/hfsplus_raw.h1
-rw-r--r--fs/hfsplus/inode.c1
-rw-r--r--fs/hfsplus/ioctl.c1
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/posix_acl.c1
-rw-r--r--fs/hfsplus/tables.c1
-rw-r--r--fs/hfsplus/unicode.c1
-rw-r--r--fs/hfsplus/wrapper.c1
-rw-r--r--fs/hfsplus/xattr.c1
-rw-r--r--fs/hfsplus/xattr.h1
-rw-r--r--fs/hfsplus/xattr_security.c1
-rw-r--r--fs/hfsplus/xattr_trusted.c1
-rw-r--r--fs/hfsplus/xattr_user.c1
-rw-r--r--fs/hostfs/hostfs.h1
-rw-r--r--fs/hpfs/alloc.c1
-rw-r--r--fs/hpfs/anode.c1
-rw-r--r--fs/hpfs/buffer.c1
-rw-r--r--fs/hpfs/dentry.c1
-rw-r--r--fs/hpfs/dir.c1
-rw-r--r--fs/hpfs/dnode.c1
-rw-r--r--fs/hpfs/ea.c1
-rw-r--r--fs/hpfs/file.c1
-rw-r--r--fs/hpfs/hpfs.h1
-rw-r--r--fs/hpfs/hpfs_fn.h1
-rw-r--r--fs/hpfs/inode.c1
-rw-r--r--fs/hpfs/map.c1
-rw-r--r--fs/hpfs/name.c1
-rw-r--r--fs/hpfs/namei.c1
-rw-r--r--fs/hugetlbfs/inode.c5
-rw-r--r--fs/inode.c2
-rw-r--r--fs/ioctl.c1
-rw-r--r--fs/iomap.c67
-rw-r--r--fs/isofs/Makefile1
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/export.c1
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/isofs/isofs.h1
-rw-r--r--fs/isofs/joliet.c1
-rw-r--r--fs/isofs/namei.c1
-rw-r--r--fs/isofs/rock.c1
-rw-r--r--fs/isofs/rock.h1
-rw-r--r--fs/isofs/util.c1
-rw-r--r--fs/jbd2/journal.c9
-rw-r--r--fs/jffs2/Makefile1
-rw-r--r--fs/jfs/Makefile1
-rw-r--r--fs/jfs/ioctl.c1
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/lockd/Makefile1
-rw-r--r--fs/lockd/clnt4xdr.c1
-rw-r--r--fs/lockd/clntxdr.c1
-rw-r--r--fs/lockd/host.c1
-rw-r--r--fs/lockd/mon.c1
-rw-r--r--fs/lockd/netns.h1
-rw-r--r--fs/lockd/procfs.c1
-rw-r--r--fs/lockd/procfs.h1
-rw-r--r--fs/lockd/svc4proc.c1
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c1
-rw-r--r--fs/lockd/svcshare.c1
-rw-r--r--fs/lockd/xdr.c1
-rw-r--r--fs/lockd/xdr4.c1
-rw-r--r--fs/minix/bitmap.c1
-rw-r--r--fs/minix/dir.c1
-rw-r--r--fs/minix/file.c1
-rw-r--r--fs/minix/itree_common.c1
-rw-r--r--fs/minix/itree_v1.c1
-rw-r--r--fs/minix/itree_v2.c1
-rw-r--r--fs/minix/minix.h1
-rw-r--r--fs/minix/namei.c1
-rw-r--r--fs/mount.h1
-rw-r--r--fs/mpage.c15
-rw-r--r--fs/namei.c5
-rw-r--r--fs/namespace.c9
-rw-r--r--fs/ncpfs/Makefile1
-rw-r--r--fs/ncpfs/dir.c10
-rw-r--r--fs/ncpfs/file.c1
-rw-r--r--fs/ncpfs/getopt.c1
-rw-r--r--fs/ncpfs/getopt.h1
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/ncpfs/ioctl.c1
-rw-r--r--fs/ncpfs/mmap.c1
-rw-r--r--fs/ncpfs/ncp_fs.h1
-rw-r--r--fs/ncpfs/ncp_fs_i.h1
-rw-r--r--fs/ncpfs/ncp_fs_sb.h3
-rw-r--r--fs/ncpfs/ncplib_kernel.c1
-rw-r--r--fs/ncpfs/ncplib_kernel.h1
-rw-r--r--fs/ncpfs/ncpsign_kernel.c1
-rw-r--r--fs/ncpfs/ncpsign_kernel.h1
-rw-r--r--fs/ncpfs/sock.c7
-rw-r--r--fs/ncpfs/symlink.c1
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/blocklayout/dev.c1
-rw-r--r--fs/nfs/blocklayout/extent_tree.c1
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/cache_lib.h1
-rw-r--r--fs/nfs/callback.c1
-rw-r--r--fs/nfs/callback.h1
-rw-r--r--fs/nfs/callback_proc.c1
-rw-r--r--fs/nfs/callback_xdr.c1
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c8
-rw-r--r--fs/nfs/dns_resolve.c1
-rw-r--r--fs/nfs/dns_resolve.h1
-rw-r--r--fs/nfs/export.c1
-rw-r--r--fs/nfs/filelayout/filelayout.c3
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c1
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/io.c1
-rw-r--r--fs/nfs/iostat.h1
-rw-r--r--fs/nfs/mount_clnt.c1
-rw-r--r--fs/nfs/netns.h1
-rw-r--r--fs/nfs/nfs.h1
-rw-r--r--fs/nfs/nfs2xdr.c1
-rw-r--r--fs/nfs/nfs3_fs.h1
-rw-r--r--fs/nfs/nfs3acl.c1
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c1
-rw-r--r--fs/nfs/nfs42xdr.c1
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4getroot.c1
-rw-r--r--fs/nfs/nfs4idmap.c2
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs/nfs4session.h1
-rw-r--r--fs/nfs/nfs4sysctl.c1
-rw-r--r--fs/nfs/nfs4trace.c1
-rw-r--r--fs/nfs/nfs4trace.h1
-rw-r--r--fs/nfs/nfs4xdr.c4
-rw-r--r--fs/nfs/nfsroot.c1
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/symlink.c1
-rw-r--r--fs/nfs/sysctl.c1
-rw-r--r--fs/nfs/unlink.c1
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/auth.c1
-rw-r--r--fs/nfsd/auth.h1
-rw-r--r--fs/nfsd/blocklayout.c5
-rw-r--r--fs/nfsd/blocklayoutxdr.c1
-rw-r--r--fs/nfsd/blocklayoutxdr.h1
-rw-r--r--fs/nfsd/cache.h1
-rw-r--r--fs/nfsd/current_stateid.h1
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/export.h1
-rw-r--r--fs/nfsd/fault_inject.c1
-rw-r--r--fs/nfsd/flexfilelayout.c1
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c1
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h1
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs2acl.c1
-rw-r--r--fs/nfsd/nfs3acl.c1
-rw-r--r--fs/nfsd/nfs3proc.c1
-rw-r--r--fs/nfsd/nfs3xdr.c1
-rw-r--r--fs/nfsd/nfs4layouts.c1
-rw-r--r--fs/nfsd/nfs4proc.c9
-rw-r--r--fs/nfsd/nfscache.c1
-rw-r--r--fs/nfsd/nfsd.h1
-rw-r--r--fs/nfsd/nfsfh.c1
-rw-r--r--fs/nfsd/nfsfh.h1
-rw-r--r--fs/nfsd/nfsproc.c1
-rw-r--r--fs/nfsd/nfssvc.c1
-rw-r--r--fs/nfsd/nfsxdr.c1
-rw-r--r--fs/nfsd/pnfs.h1
-rw-r--r--fs/nfsd/stats.c1
-rw-r--r--fs/nfsd/stats.h1
-rw-r--r--fs/nfsd/trace.h1
-rw-r--r--fs/nfsd/vfs.c1
-rw-r--r--fs/nfsd/vfs.h1
-rw-r--r--fs/nfsd/xdr.h1
-rw-r--r--fs/nfsd/xdr3.h1
-rw-r--r--fs/nfsd/xdr4cb.h1
-rw-r--r--fs/nilfs2/Makefile1
-rw-r--r--fs/nilfs2/export.h1
-rw-r--r--fs/nls/Makefile1
-rw-r--r--fs/notify/Makefile1
-rw-r--r--fs/notify/fanotify/fanotify.c1
-rw-r--r--fs/notify/fanotify/fanotify.h1
-rw-r--r--fs/notify/fanotify/fanotify_user.c1
-rw-r--r--fs/notify/fdinfo.c1
-rw-r--r--fs/notify/fdinfo.h1
-rw-r--r--fs/notify/fsnotify.h1
-rw-r--r--fs/notify/inotify/inotify.h1
-rw-r--r--fs/nsfs.c1
-rw-r--r--fs/ntfs/Makefile1
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c24
-rw-r--r--fs/ocfs2/ioctl.c1
-rw-r--r--fs/ocfs2/ioctl.h1
-rw-r--r--fs/ocfs2/mmap.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h1
-rw-r--r--fs/ocfs2/quota.h1
-rw-r--r--fs/ocfs2/quota_global.c1
-rw-r--r--fs/ocfs2/quota_local.c1
-rw-r--r--fs/omfs/bitmap.c1
-rw-r--r--fs/omfs/omfs.h1
-rw-r--r--fs/omfs/omfs_fs.h1
-rw-r--r--fs/orangefs/Makefile1
-rw-r--r--fs/orangefs/acl.c1
-rw-r--r--fs/orangefs/dcache.c1
-rw-r--r--fs/orangefs/devorangefs-req.c1
-rw-r--r--fs/orangefs/dir.c1
-rw-r--r--fs/orangefs/downcall.h1
-rw-r--r--fs/orangefs/file.c1
-rw-r--r--fs/orangefs/inode.c1
-rw-r--r--fs/orangefs/namei.c1
-rw-r--r--fs/orangefs/orangefs-bufmap.c1
-rw-r--r--fs/orangefs/orangefs-bufmap.h1
-rw-r--r--fs/orangefs/orangefs-cache.c1
-rw-r--r--fs/orangefs/orangefs-debug.h1
-rw-r--r--fs/orangefs/orangefs-debugfs.c1
-rw-r--r--fs/orangefs/orangefs-debugfs.h1
-rw-r--r--fs/orangefs/orangefs-dev-proto.h1
-rw-r--r--fs/orangefs/orangefs-kernel.h1
-rw-r--r--fs/orangefs/orangefs-sysfs.c1
-rw-r--r--fs/orangefs/orangefs-utils.c1
-rw-r--r--fs/orangefs/protocol.h1
-rw-r--r--fs/orangefs/super.c1
-rw-r--r--fs/orangefs/symlink.c1
-rw-r--r--fs/orangefs/upcall.h1
-rw-r--r--fs/orangefs/waitqueue.c1
-rw-r--r--fs/orangefs/xattr.c1
-rw-r--r--fs/overlayfs/copy_up.c6
-rw-r--r--fs/overlayfs/dir.c20
-rw-r--r--fs/overlayfs/inode.c20
-rw-r--r--fs/overlayfs/namei.c33
-rw-r--r--fs/overlayfs/overlayfs.h4
-rw-r--r--fs/overlayfs/ovl_entry.h5
-rw-r--r--fs/overlayfs/readdir.c19
-rw-r--r--fs/overlayfs/super.c30
-rw-r--r--fs/overlayfs/util.c24
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c38
-rw-r--r--fs/proc/base.c1
-rw-r--r--fs/proc/cmdline.c1
-rw-r--r--fs/proc/cpuinfo.c1
-rw-r--r--fs/proc/devices.c1
-rw-r--r--fs/proc/fd.c1
-rw-r--r--fs/proc/fd.h1
-rw-r--r--fs/proc/inode.c1
-rw-r--r--fs/proc/interrupts.c1
-rw-r--r--fs/proc/kcore.c1
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/loadavg.c1
-rw-r--r--fs/proc/meminfo.c1
-rw-r--r--fs/proc/namespaces.c1
-rw-r--r--fs/proc/page.c1
-rw-r--r--fs/proc/proc_sysctl.c1
-rw-r--r--fs/proc/proc_tty.c4
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/self.c1
-rw-r--r--fs/proc/softirqs.c1
-rw-r--r--fs/proc/stat.c1
-rw-r--r--fs/proc/task_mmu.c7
-rw-r--r--fs/proc/task_nommu.c1
-rw-r--r--fs/proc/thread_self.c1
-rw-r--r--fs/proc/uptime.c1
-rw-r--r--fs/proc/version.c1
-rw-r--r--fs/proc_namespace.c3
-rw-r--r--fs/pstore/Makefile1
-rw-r--r--fs/pstore/internal.h1
-rw-r--r--fs/pstore/platform.c7
-rw-r--r--fs/qnx4/bitmap.c1
-rw-r--r--fs/qnx4/dir.c1
-rw-r--r--fs/qnx4/namei.c1
-rw-r--r--fs/qnx4/qnx4.h1
-rw-r--r--fs/qnx6/dir.c1
-rw-r--r--fs/qnx6/namei.c1
-rw-r--r--fs/qnx6/qnx6.h1
-rw-r--r--fs/qnx6/super_mmi.c1
-rw-r--r--fs/quota/Makefile1
-rw-r--r--fs/quota/compat.c1
-rw-r--r--fs/quota/dquot.c30
-rw-r--r--fs/quota/kqid.c1
-rw-r--r--fs/quota/netlink.c1
-rw-r--r--fs/quota/quota.c1
-rw-r--r--fs/quota/quota_tree.h1
-rw-r--r--fs/quota/quota_v2.c4
-rw-r--r--fs/quota/quotaio_v1.h1
-rw-r--r--fs/quota/quotaio_v2.h1
-rw-r--r--fs/read_write.c5
-rw-r--r--fs/readdir.c12
-rw-r--r--fs/reiserfs/Makefile1
-rw-r--r--fs/reiserfs/acl.h1
-rw-r--r--fs/reiserfs/journal.c1
-rw-r--r--fs/reiserfs/lock.c1
-rw-r--r--fs/reiserfs/reiserfs.h1
-rw-r--r--fs/reiserfs/tail_conversion.c1
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/reiserfs/xattr.h1
-rw-r--r--fs/reiserfs/xattr_acl.c1
-rw-r--r--fs/reiserfs/xattr_security.c1
-rw-r--r--fs/reiserfs/xattr_trusted.c1
-rw-r--r--fs/reiserfs/xattr_user.c1
-rw-r--r--fs/romfs/Makefile1
-rw-r--r--fs/select.c1
-rw-r--r--fs/seq_file.c1
-rw-r--r--fs/signalfd.c1
-rw-r--r--fs/splice.c2
-rw-r--r--fs/squashfs/Makefile1
-rw-r--r--fs/stat.c1
-rw-r--r--fs/statfs.c1
-rw-r--r--fs/super.c1
-rw-r--r--fs/sync.c1
-rw-r--r--fs/sysv/balloc.c1
-rw-r--r--fs/sysv/dir.c1
-rw-r--r--fs/sysv/file.c1
-rw-r--r--fs/sysv/ialloc.c1
-rw-r--r--fs/sysv/inode.c1
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/sysv/namei.c1
-rw-r--r--fs/sysv/sysv.h1
-rw-r--r--fs/timerfd.c1
-rw-r--r--fs/ubifs/Makefile1
-rw-r--r--fs/ubifs/crypto.c2
-rw-r--r--fs/ubifs/ioctl.c5
-rw-r--r--fs/ubifs/misc.c1
-rw-r--r--fs/ubifs/super.c8
-rw-r--r--fs/ubifs/ubifs.h18
-rw-r--r--fs/ubifs/xattr.c1
-rw-r--r--fs/udf/udf_i.h1
-rw-r--r--fs/udf/udf_sb.h1
-rw-r--r--fs/udf/udfdecl.h1
-rw-r--r--fs/udf/udfend.h1
-rw-r--r--fs/ufs/balloc.c1
-rw-r--r--fs/ufs/cylinder.c1
-rw-r--r--fs/ufs/dir.c1
-rw-r--r--fs/ufs/file.c1
-rw-r--r--fs/ufs/ialloc.c1
-rw-r--r--fs/ufs/inode.c1
-rw-r--r--fs/ufs/namei.c1
-rw-r--r--fs/ufs/swab.h1
-rw-r--r--fs/ufs/ufs.h1
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/ufs/util.c1
-rw-r--r--fs/ufs/util.h1
-rw-r--r--fs/userfaultfd.c74
-rw-r--r--fs/utimes.c1
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/Kconfig17
-rw-r--r--fs/xfs/Makefile29
-rw-r--r--fs/xfs/kmem.h3
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c13
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c58
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2093
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h67
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c250
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h22
-rw-r--r--fs/xfs/libxfs/xfs_btree.c259
-rw-r--r--fs/xfs/libxfs/xfs_btree.h32
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h1
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c22
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c24
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h17
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h106
-rw-r--r--fs/xfs/libxfs/xfs_format.h37
-rw-r--r--fs/xfs/libxfs/xfs_fs.h77
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c95
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h7
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c1043
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c1
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c1333
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h138
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h51
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c1
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c1
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c13
-rw-r--r--fs/xfs/libxfs/xfs_types.h22
-rw-r--r--fs/xfs/scrub/agheader.c658
-rw-r--r--fs/xfs/scrub/alloc.c102
-rw-r--r--fs/xfs/scrub/attr.c471
-rw-r--r--fs/xfs/scrub/bmap.c363
-rw-r--r--fs/xfs/scrub/btree.c516
-rw-r--r--fs/xfs/scrub/btree.h57
-rw-r--r--fs/xfs/scrub/common.c574
-rw-r--r--fs/xfs/scrub/common.h144
-rw-r--r--fs/xfs/scrub/dabtree.c591
-rw-r--r--fs/xfs/scrub/dabtree.h59
-rw-r--r--fs/xfs/scrub/dir.c816
-rw-r--r--fs/xfs/scrub/ialloc.c337
-rw-r--r--fs/xfs/scrub/inode.c611
-rw-r--r--fs/xfs/scrub/parent.c317
-rw-r--r--fs/xfs/scrub/quota.c304
-rw-r--r--fs/xfs/scrub/refcount.c99
-rw-r--r--fs/xfs/scrub/rmap.c138
-rw-r--r--fs/xfs/scrub/rtbitmap.c108
-rw-r--r--fs/xfs/scrub/scrub.c392
-rw-r--r--fs/xfs/scrub/scrub.h115
-rw-r--r--fs/xfs/scrub/symlink.c92
-rw-r--r--fs/xfs/scrub/trace.c59
-rw-r--r--fs/xfs/scrub/trace.h499
-rw-r--r--fs/xfs/scrub/xfs_scrub.h29
-rw-r--r--fs/xfs/xfs.h1
-rw-r--r--fs/xfs/xfs_acl.c22
-rw-r--r--fs/xfs/xfs_aops.c50
-rw-r--r--fs/xfs/xfs_attr.h5
-rw-r--r--fs/xfs/xfs_attr_inactive.c71
-rw-r--r--fs/xfs/xfs_attr_list.c161
-rw-r--r--fs/xfs/xfs_bmap_util.c774
-rw-r--r--fs/xfs/xfs_bmap_util.h23
-rw-r--r--fs/xfs/xfs_buf.c18
-rw-r--r--fs/xfs/xfs_buf.h5
-rw-r--r--fs/xfs/xfs_dir2_readdir.c10
-rw-r--r--fs/xfs/xfs_discard.h1
-rw-r--r--fs/xfs/xfs_dquot.c21
-rw-r--r--fs/xfs/xfs_error.c8
-rw-r--r--fs/xfs/xfs_error.h81
-rw-r--r--fs/xfs/xfs_file.c66
-rw-r--r--fs/xfs/xfs_fsmap.c58
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_inode.c41
-rw-r--r--fs/xfs/xfs_inode.h4
-rw-r--r--fs/xfs/xfs_inode_item.c104
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_ioctl.c161
-rw-r--r--fs/xfs/xfs_ioctl.h4
-rw-r--r--fs/xfs/xfs_ioctl32.c1
-rw-r--r--fs/xfs/xfs_iomap.c28
-rw-r--r--fs/xfs/xfs_iomap.h2
-rw-r--r--fs/xfs/xfs_iops.c52
-rw-r--r--fs/xfs/xfs_itable.c13
-rw-r--r--fs/xfs/xfs_itable.h2
-rw-r--r--fs/xfs/xfs_linux.h21
-rw-r--r--fs/xfs/xfs_log.c35
-rw-r--r--fs/xfs/xfs_log_priv.h4
-rw-r--r--fs/xfs/xfs_log_recover.c62
-rw-r--r--fs/xfs/xfs_message.h1
-rw-r--r--fs/xfs/xfs_mount.c17
-rw-r--r--fs/xfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/xfs_pnfs.c3
-rw-r--r--fs/xfs/xfs_pnfs.h1
-rw-r--r--fs/xfs/xfs_reflink.c103
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_super.c12
-rw-r--r--fs/xfs/xfs_trace.h64
-rw-r--r--fs/xfs/xfs_trans_ail.c22
707 files changed, 16662 insertions, 7034 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 9619ccadd2fc..e7800a5c7395 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_9P_FS) := 9p.o
9p-objs := \
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index adaf6f6dd858..e1cbdfdb7c68 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -310,9 +310,13 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- if (unlikely(copied < len && !PageUptodate(page))) {
- copied = 0;
- goto out;
+ if (!PageUptodate(page)) {
+ if (unlikely(copied < len)) {
+ copied = 0;
+ goto out;
+ } else if (len == PAGE_SIZE) {
+ SetPageUptodate(page);
+ }
}
/*
* No need to use i_size_read() here, the i_size
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9c67b1..ef772f1eaff8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux filesystems.
#
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index fadf408bdd46..c76db75f02aa 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>
#include <linux/adfs_fs.h>
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 46c0d5671cd5..754afb14a6ff 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/adfs/file.c
*
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 773749be8290..a92eb6ae2ae2 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifdef pr_fmt
#undef pr_fmt
#endif
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 8cf941c3b511..185d5ab7e986 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/amigaffs.c
*
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
index 43b41c06aa37..f9bef9056659 100644
--- a/fs/affs/amigaffs.h
+++ b/fs/affs/amigaffs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef AMIGAFFS_H
#define AMIGAFFS_H
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 2b2112475ec2..2b1399611d9e 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/bitmap.c
*
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 591ecd7f3063..a105e77df2c1 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/dir.c
*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 00331810f690..a85817f54483 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/file.c
*
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index fd4ef3c40e40..73598bff8506 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/inode.c
*
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 46d3ace6761d..d8aa0ae3d037 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/namei.c
*
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ae622cdce142..a7531b26e8f0 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/affs/symlink.c
*
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 095c54165dfd..641148208e90 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for Red Hat Linux AFS client.
#
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 40b2bab3e401..50bd5bb1c4fb 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* AFS network device helpers
*
* Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
diff --git a/fs/aio.c b/fs/aio.c
index 5a2487217072..e6de7715228c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -576,7 +576,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
* actually has a cancel function, hence the cmpxchg()
*/
- cancel = ACCESS_ONCE(kiocb->ki_cancel);
+ cancel = READ_ONCE(kiocb->ki_cancel);
do {
if (!cancel || cancel == KIOCB_CANCELLED)
return -EINVAL;
diff --git a/fs/attr.c b/fs/attr.c
index 135304146120..12ffdb6fb63c 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/attr.c
*
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index bb53728c7a31..213b51dbbb60 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/bad_inode.c
*
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index b914cfb03820..7cd47245694d 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* befs.h
*
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 69c9d8cde955..8019fde814b7 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/befs/befs_fs_types.h
*
diff --git a/fs/befs/btree.h b/fs/befs/btree.h
index 60c6c728e64e..a253a6276d8e 100644
--- a/fs/befs/btree.h
+++ b/fs/befs/btree.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* btree.h
*
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 720b3bc5c16a..97719a7c7e40 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/befs/datastream.c
*
diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h
index 7ff9ff09ec6e..39b1d4766ccf 100644
--- a/fs/befs/datastream.h
+++ b/fs/befs/datastream.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* datastream.h
*
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 36656c86f50e..eb7bd6c692c7 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/befs/debug.c
*
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 27223878ba9f..bb55a54c24c0 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/befs/endian.h
*
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 5367a6470a69..791b46a6f2f9 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* inode.c
*
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 227cb86e07fe..2caf50a4abbe 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/befs/io.c
*
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index f40006db36df..67aef3bb89e4 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/bfs/bfs.h
* Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3e5ac30e8b6f..ee832ca5f734 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/bfs/dir.c
* BFS directory operations.
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 97f1b5160155..1476cdd90cfb 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/bfs/file.c
* BFS file operations.
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 475d083f8088..5d6b94475f27 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/****************************************************************************/
/*
* linux/fs/binfmt_flat.c
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index ce7181ea60fa..a7c5a9861bef 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -54,7 +54,7 @@ typedef struct {
int size; /* size of magic/mask */
char *magic; /* magic or filename extension */
char *mask; /* mask, NULL for exact match */
- char *interpreter; /* filename of interpreter */
+ const char *interpreter; /* filename of interpreter */
char *name;
struct dentry *dentry;
struct file *interp_file;
@@ -131,27 +131,26 @@ static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file *interp_file = NULL;
- char iname[BINPRM_BUF_SIZE];
- const char *iname_addr = iname;
int retval;
int fd_binary = -1;
retval = -ENOEXEC;
if (!enabled)
- goto ret;
+ return retval;
/* to keep locking time low, we copy the interpreter string */
read_lock(&entries_lock);
fmt = check_file(bprm);
if (fmt)
- strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
+ dget(fmt->dentry);
read_unlock(&entries_lock);
if (!fmt)
- goto ret;
+ return retval;
/* Need to be able to load the file after exec */
+ retval = -ENOENT;
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
- return -ENOENT;
+ goto ret;
if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
retval = remove_arg_zero(bprm);
@@ -195,22 +194,22 @@ static int load_misc_binary(struct linux_binprm *bprm)
bprm->argc++;
/* add the interp as argv[0] */
- retval = copy_strings_kernel(1, &iname_addr, bprm);
+ retval = copy_strings_kernel(1, &fmt->interpreter, bprm);
if (retval < 0)
goto error;
bprm->argc++;
/* Update interp in case binfmt_script needs it. */
- retval = bprm_change_interp(iname, bprm);
+ retval = bprm_change_interp(fmt->interpreter, bprm);
if (retval < 0)
goto error;
- if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) {
+ if (fmt->flags & MISC_FMT_OPEN_FILE) {
interp_file = filp_clone_open(fmt->interp_file);
if (!IS_ERR(interp_file))
deny_write_access(interp_file);
} else {
- interp_file = open_exec(iname);
+ interp_file = open_exec(fmt->interpreter);
}
retval = PTR_ERR(interp_file);
if (IS_ERR(interp_file))
@@ -238,6 +237,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
goto error;
ret:
+ dput(fmt->dentry);
return retval;
error:
if (fd_binary > 0)
@@ -594,8 +594,13 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
static void bm_evict_inode(struct inode *inode)
{
+ Node *e = inode->i_private;
+
+ if (e && e->flags & MISC_FMT_OPEN_FILE)
+ filp_close(e->interp_file, NULL);
+
clear_inode(inode);
- kfree(inode->i_private);
+ kfree(e);
}
static void kill_node(Node *e)
@@ -603,24 +608,14 @@ static void kill_node(Node *e)
struct dentry *dentry;
write_lock(&entries_lock);
- dentry = e->dentry;
- if (dentry) {
- list_del_init(&e->list);
- e->dentry = NULL;
- }
+ list_del_init(&e->list);
write_unlock(&entries_lock);
- if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) {
- filp_close(e->interp_file, NULL);
- e->interp_file = NULL;
- }
-
- if (dentry) {
- drop_nlink(d_inode(dentry));
- d_drop(dentry);
- dput(dentry);
- simple_release_fs(&bm_mnt, &entry_count);
- }
+ dentry = e->dentry;
+ drop_nlink(d_inode(dentry));
+ d_drop(dentry);
+ dput(dentry);
+ simple_release_fs(&bm_mnt, &entry_count);
}
/* /<entry> */
@@ -665,7 +660,8 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
root = file_inode(file)->i_sb->s_root;
inode_lock(d_inode(root));
- kill_node(e);
+ if (!list_empty(&e->list))
+ kill_node(e);
inode_unlock(d_inode(root));
break;
@@ -794,7 +790,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
inode_lock(d_inode(root));
while (!list_empty(&entries))
- kill_node(list_entry(entries.next, Node, list));
+ kill_node(list_first_entry(&entries, Node, list));
inode_unlock(d_inode(root));
break;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index afdf4e3cafc2..7cde3f46ad26 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm)
const char *i_arg, *i_name;
char *cp;
struct file *file;
- char interp[BINPRM_BUF_SIZE];
int retval;
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
@@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm)
break;
}
for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++);
- if (*cp == '\0')
+ if (*cp == '\0')
return -ENOEXEC; /* No interpreter name found */
i_name = cp;
i_arg = NULL;
@@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm)
*cp++ = '\0';
if (*cp)
i_arg = cp;
- strcpy (interp, i_name);
/*
* OK, we've parsed out the interpreter name and
* (optional) argument.
@@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm)
if (retval)
return retval;
retval = copy_strings_kernel(1, &bprm->interp, bprm);
- if (retval < 0) return retval;
+ if (retval < 0)
+ return retval;
bprm->argc++;
if (i_arg) {
retval = copy_strings_kernel(1, &i_arg, bprm);
- if (retval < 0) return retval;
+ if (retval < 0)
+ return retval;
bprm->argc++;
}
retval = copy_strings_kernel(1, &i_name, bprm);
- if (retval) return retval;
+ if (retval)
+ return retval;
bprm->argc++;
- retval = bprm_change_interp(interp, bprm);
+ retval = bprm_change_interp(i_name, bprm);
if (retval < 0)
return retval;
/*
* OK, now restart the process with the interpreter's dentry.
*/
- file = open_exec(interp);
+ file = open_exec(i_name);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 93d088ffc05c..789f55e851ae 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -716,10 +716,12 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
- if (result)
+ if (result) {
end_page_writeback(page);
- else
+ } else {
+ clean_page_buffers(page);
unlock_page(page);
+ }
blk_queue_exit(bdev->bd_queue);
return result;
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a26c63b4ad68..2e558227931a 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -91,3 +91,14 @@ config BTRFS_ASSERT
any of the assertions trip. This is meant for btrfs developers only.
If unsure, say N.
+
+config BTRFS_FS_REF_VERIFY
+ bool "Btrfs with the ref verify tool compiled in"
+ depends on BTRFS_FS
+ default n
+ help
+ Enable run-time extent reference verification instrumentation. This
+ is meant to be used by btrfs developers for tracking down extent
+ reference problems or verifying they didn't break something.
+
+ If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 962a95aefb81..6fe881d5cb38 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_BTRFS_FS) := btrfs.o
@@ -9,10 +10,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o free-space-tree.o
+ uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index e00c8a9fd5bb..d5540749f0e5 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -67,7 +67,7 @@ struct btrfs_workqueue {
static void normal_work_helper(struct btrfs_work *work);
#define BTRFS_WORK_HELPER(name) \
-void btrfs_##name(struct work_struct *arg) \
+noinline_for_stack void btrfs_##name(struct work_struct *arg) \
{ \
struct btrfs_work *work = container_of(arg, struct btrfs_work, \
normal_work); \
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b517ef1477ea..7d0dc100a09a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -40,12 +40,14 @@ static int check_extent_in_eb(const struct btrfs_key *key,
const struct extent_buffer *eb,
const struct btrfs_file_extent_item *fi,
u64 extent_item_pos,
- struct extent_inode_elem **eie)
+ struct extent_inode_elem **eie,
+ bool ignore_offset)
{
u64 offset = 0;
struct extent_inode_elem *e;
- if (!btrfs_file_extent_compression(eb, fi) &&
+ if (!ignore_offset &&
+ !btrfs_file_extent_compression(eb, fi) &&
!btrfs_file_extent_encryption(eb, fi) &&
!btrfs_file_extent_other_encoding(eb, fi)) {
u64 data_offset;
@@ -84,7 +86,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie)
static int find_extent_in_eb(const struct extent_buffer *eb,
u64 wanted_disk_byte, u64 extent_item_pos,
- struct extent_inode_elem **eie)
+ struct extent_inode_elem **eie,
+ bool ignore_offset)
{
u64 disk_byte;
struct btrfs_key key;
@@ -113,7 +116,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
if (disk_byte != wanted_disk_byte)
continue;
- ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset);
if (ret < 0)
return ret;
}
@@ -419,7 +422,7 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents, struct prelim_ref *ref,
int level, u64 time_seq, const u64 *extent_item_pos,
- u64 total_refs)
+ u64 total_refs, bool ignore_offset)
{
int ret = 0;
int slot;
@@ -472,7 +475,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
- &eie);
+ &eie, ignore_offset);
if (ret < 0)
break;
}
@@ -510,7 +513,8 @@ next:
static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct prelim_ref *ref, struct ulist *parents,
- const u64 *extent_item_pos, u64 total_refs)
+ const u64 *extent_item_pos, u64 total_refs,
+ bool ignore_offset)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -581,7 +585,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos, total_refs);
+ extent_item_pos, total_refs, ignore_offset);
out:
path->lowest_level = 0;
btrfs_release_path(path);
@@ -616,7 +620,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct preftrees *preftrees,
const u64 *extent_item_pos, u64 total_refs,
- struct share_check *sc)
+ struct share_check *sc, bool ignore_offset)
{
int err;
int ret = 0;
@@ -661,7 +665,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
err = resolve_indirect_ref(fs_info, path, time_seq, ref,
parents, extent_item_pos,
- total_refs);
+ total_refs, ignore_offset);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -769,6 +773,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_key key;
struct btrfs_key tmp_op_key;
struct btrfs_key *op_key = NULL;
+ struct rb_node *n;
int count;
int ret = 0;
@@ -778,7 +783,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
spin_lock(&head->lock);
- list_for_each_entry(node, &head->ref_list, list) {
+ for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ ref_node);
if (node->seq > seq)
continue;
@@ -1107,13 +1114,17 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* Otherwise this returns 0 for success and <0 for an error.
*
+ * If ignore_offset is set to false, only extent refs whose offsets match
+ * extent_item_pos are returned. If true, every extent ref is returned
+ * and extent_item_pos is ignored.
+ *
* FIXME some caching might speed things up
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist *refs,
struct ulist *roots, const u64 *extent_item_pos,
- struct share_check *sc)
+ struct share_check *sc, bool ignore_offset)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -1178,7 +1189,7 @@ again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -1189,7 +1200,7 @@ again:
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
goto again;
}
spin_unlock(&delayed_refs->lock);
@@ -1235,7 +1246,7 @@ again:
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
- extent_item_pos, total_refs, sc);
+ extent_item_pos, total_refs, sc, ignore_offset);
if (ret)
goto out;
@@ -1282,7 +1293,7 @@ again:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
ret = find_extent_in_eb(eb, bytenr,
- *extent_item_pos, &eie);
+ *extent_item_pos, &eie, ignore_offset);
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
@@ -1350,7 +1361,7 @@ static void free_leaf_list(struct ulist *blocks)
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, bool ignore_offset)
{
int ret;
@@ -1359,7 +1370,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
- *leafs, NULL, extent_item_pos, NULL);
+ *leafs, NULL, extent_item_pos, NULL, ignore_offset);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1383,7 +1394,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*/
static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots)
+ u64 time_seq, struct ulist **roots,
+ bool ignore_offset)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1402,7 +1414,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
ULIST_ITER_INIT(&uiter);
while (1) {
ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
- tmp, *roots, NULL, NULL);
+ tmp, *roots, NULL, NULL, ignore_offset);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
@@ -1421,14 +1433,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots)
+ u64 time_seq, struct ulist **roots,
+ bool ignore_offset)
{
int ret;
if (!trans)
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
- time_seq, roots);
+ time_seq, roots, ignore_offset);
if (!trans)
up_read(&fs_info->commit_root_sem);
return ret;
@@ -1483,7 +1496,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
ULIST_ITER_INIT(&uiter);
while (1) {
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
- roots, NULL, &shared);
+ roots, NULL, &shared, false);
if (ret == BACKREF_FOUND_SHARED) {
/* this is the only condition under which we return 1 */
ret = 1;
@@ -1877,7 +1890,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid, u64 extent_item_pos,
int search_commit_root,
- iterate_extent_inodes_t *iterate, void *ctx)
+ iterate_extent_inodes_t *iterate, void *ctx,
+ bool ignore_offset)
{
int ret;
struct btrfs_trans_handle *trans = NULL;
@@ -1903,14 +1917,15 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
tree_mod_seq_elem.seq, &refs,
- &extent_item_pos);
+ &extent_item_pos, ignore_offset);
if (ret)
goto out;
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots);
+ tree_mod_seq_elem.seq, &roots,
+ ignore_offset);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
@@ -1943,7 +1958,8 @@ out:
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx)
+ iterate_extent_inodes_t *iterate, void *ctx,
+ bool ignore_offset)
{
int ret;
u64 extent_item_pos;
@@ -1961,7 +1977,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, search_commit_root,
- iterate, ctx);
+ iterate, ctx, ignore_offset);
return ret;
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e410335841aa..0c2fab8514ff 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -43,17 +43,19 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid,
u64 extent_offset, int search_commit_root,
- iterate_extent_inodes_t *iterate, void *ctx);
+ iterate_extent_inodes_t *iterate, void *ctx,
+ bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx);
+ iterate_extent_inodes_t *iterate, void *ctx,
+ bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots);
+ u64 time_seq, struct ulist **roots, bool ignore_offset);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index eccadb5f62a5..63f0ccc92a71 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,14 +36,13 @@
#define BTRFS_INODE_ORPHAN_META_RESERVED 1
#define BTRFS_INODE_DUMMY 2
#define BTRFS_INODE_IN_DEFRAG 3
-#define BTRFS_INODE_DELALLOC_META_RESERVED 4
-#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
-#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
-#define BTRFS_INODE_NEEDS_FULL_SYNC 7
-#define BTRFS_INODE_COPY_EVERYTHING 8
-#define BTRFS_INODE_IN_DELALLOC_LIST 9
-#define BTRFS_INODE_READDIO_NEED_LOCK 10
-#define BTRFS_INODE_HAS_PROPS 11
+#define BTRFS_INODE_HAS_ORPHAN_ITEM 4
+#define BTRFS_INODE_HAS_ASYNC_EXTENT 5
+#define BTRFS_INODE_NEEDS_FULL_SYNC 6
+#define BTRFS_INODE_COPY_EVERYTHING 7
+#define BTRFS_INODE_IN_DELALLOC_LIST 8
+#define BTRFS_INODE_READDIO_NEED_LOCK 9
+#define BTRFS_INODE_HAS_PROPS 10
/* in memory btrfs inode */
struct btrfs_inode {
@@ -176,7 +175,8 @@ struct btrfs_inode {
* of extent items we've reserved metadata for.
*/
unsigned outstanding_extents;
- unsigned reserved_extents;
+
+ struct btrfs_block_rsv block_rsv;
/*
* Cached values of inode properties
@@ -267,6 +267,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
return false;
}
+static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
+ int mod)
+{
+ lockdep_assert_held(&inode->lock);
+ inode->outstanding_extents += mod;
+ if (btrfs_is_free_space_inode(inode))
+ return;
+ trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
+ mod);
+}
+
static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
int ret = 0;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d5a9b51f0d7..7d51b5a5b505 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add(
struct btrfsic_dev_state_hashtable *h)
{
const unsigned int hashval =
- (((unsigned int)((uintptr_t)ds->bdev)) &
+ (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
(BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
list_add(&ds->collision_resolving_node, h->table + hashval);
@@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bio() is also called before
* btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio_dev(bio));
+ dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
unsigned int i = 0;
@@ -2913,7 +2913,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
state = kvzalloc(sizeof(*state), GFP_KERNEL);
if (!state) {
pr_info("btrfs check-integrity: allocation failed!\n");
- return -1;
+ return -ENOMEM;
}
if (!btrfsic_is_initialized) {
@@ -2945,7 +2945,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
if (NULL == ds) {
pr_info("btrfs check-integrity: kmalloc() failed!\n");
mutex_unlock(&btrfsic_mutex);
- return -1;
+ return -ENOMEM;
}
ds->bdev = device->bdev;
ds->state = state;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b51d23f5cafa..b35ce16b3df3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,6 +33,8 @@
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
+#include <linux/sort.h>
+#include <linux/log2.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -107,7 +109,8 @@ static void end_compressed_bio_read(struct bio *bio)
struct inode *inode;
struct page *page;
unsigned long index;
- int ret;
+ unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+ int ret = 0;
if (bio->bi_status)
cb->errors = 1;
@@ -118,6 +121,21 @@ static void end_compressed_bio_read(struct bio *bio)
if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
+ /*
+ * Record the correct mirror_num in cb->orig_bio so that
+ * read-repair can work properly.
+ */
+ ASSERT(btrfs_io_bio(cb->orig_bio));
+ btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+ cb->mirror_num = mirror;
+
+ /*
+ * Some IO in this cb have failed, just skip checksum as there
+ * is no way it could be correct.
+ */
+ if (cb->errors == 1)
+ goto csum_failed;
+
inode = cb->inode;
ret = check_compressed_csum(BTRFS_I(inode), cb,
(u64)bio->bi_iter.bi_sector << 9);
@@ -239,7 +257,8 @@ static void end_compressed_bio_write(struct bio *bio)
cb->start,
cb->start + cb->len - 1,
NULL,
- bio->bi_status ? 0 : 1);
+ bio->bi_status ?
+ BLK_STS_OK : BLK_STS_NOTSUPP);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -690,7 +709,86 @@ out:
return ret;
}
-static struct {
+/*
+ * Heuristic uses systematic sampling to collect data from the input data
+ * range, the logic can be tuned by the following constants:
+ *
+ * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
+ * @SAMPLING_INTERVAL - range from which the sampled data can be collected
+ */
+#define SAMPLING_READ_SIZE (16)
+#define SAMPLING_INTERVAL (256)
+
+/*
+ * For statistical analysis of the input data we consider bytes that form a
+ * Galois Field of 256 objects. Each object has an attribute count, ie. how
+ * many times the object appeared in the sample.
+ */
+#define BUCKET_SIZE (256)
+
+/*
+ * The size of the sample is based on a statistical sampling rule of thumb.
+ * The common way is to perform sampling tests as long as the number of
+ * elements in each cell is at least 5.
+ *
+ * Instead of 5, we choose 32 to obtain more accurate results.
+ * If the data contain the maximum number of symbols, which is 256, we obtain a
+ * sample size bound by 8192.
+ *
+ * For a sample of at most 8KB of data per data range: 16 consecutive bytes
+ * from up to 512 locations.
+ */
+#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
+ SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
+
+struct bucket_item {
+ u32 count;
+};
+
+struct heuristic_ws {
+ /* Partial copy of input data */
+ u8 *sample;
+ u32 sample_size;
+ /* Buckets store counters for each byte value */
+ struct bucket_item *bucket;
+ struct list_head list;
+};
+
+static void free_heuristic_ws(struct list_head *ws)
+{
+ struct heuristic_ws *workspace;
+
+ workspace = list_entry(ws, struct heuristic_ws, list);
+
+ kvfree(workspace->sample);
+ kfree(workspace->bucket);
+ kfree(workspace);
+}
+
+static struct list_head *alloc_heuristic_ws(void)
+{
+ struct heuristic_ws *ws;
+
+ ws = kzalloc(sizeof(*ws), GFP_KERNEL);
+ if (!ws)
+ return ERR_PTR(-ENOMEM);
+
+ ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
+ if (!ws->sample)
+ goto fail;
+
+ ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
+ if (!ws->bucket)
+ goto fail;
+
+ INIT_LIST_HEAD(&ws->list);
+ return &ws->list;
+fail:
+ free_heuristic_ws(&ws->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+struct workspaces_list {
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
@@ -699,7 +797,11 @@ static struct {
atomic_t total_ws;
/* Waiters for a free workspace */
wait_queue_head_t ws_wait;
-} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
+};
+
+static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
+
+static struct workspaces_list btrfs_heuristic_ws;
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
@@ -709,11 +811,25 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
void __init btrfs_init_compress(void)
{
+ struct list_head *workspace;
int i;
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- struct list_head *workspace;
+ INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
+ spin_lock_init(&btrfs_heuristic_ws.ws_lock);
+ atomic_set(&btrfs_heuristic_ws.total_ws, 0);
+ init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+
+ workspace = alloc_heuristic_ws();
+ if (IS_ERR(workspace)) {
+ pr_warn(
+ "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+ } else {
+ atomic_set(&btrfs_heuristic_ws.total_ws, 1);
+ btrfs_heuristic_ws.free_ws = 1;
+ list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+ }
+ for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
spin_lock_init(&btrfs_comp_ws[i].ws_lock);
atomic_set(&btrfs_comp_ws[i].total_ws, 0);
@@ -740,18 +856,32 @@ void __init btrfs_init_compress(void)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-static struct list_head *find_workspace(int type)
+static struct list_head *__find_workspace(int type, bool heuristic)
{
struct list_head *workspace;
int cpus = num_online_cpus();
int idx = type - 1;
unsigned nofs_flag;
+ struct list_head *idle_ws;
+ spinlock_t *ws_lock;
+ atomic_t *total_ws;
+ wait_queue_head_t *ws_wait;
+ int *free_ws;
+
+ if (heuristic) {
+ idle_ws = &btrfs_heuristic_ws.idle_ws;
+ ws_lock = &btrfs_heuristic_ws.ws_lock;
+ total_ws = &btrfs_heuristic_ws.total_ws;
+ ws_wait = &btrfs_heuristic_ws.ws_wait;
+ free_ws = &btrfs_heuristic_ws.free_ws;
+ } else {
+ idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ total_ws = &btrfs_comp_ws[idx].total_ws;
+ ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ free_ws = &btrfs_comp_ws[idx].free_ws;
+ }
- struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
- spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
- atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
- wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
- int *free_ws = &btrfs_comp_ws[idx].free_ws;
again:
spin_lock(ws_lock);
if (!list_empty(idle_ws)) {
@@ -781,7 +911,10 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = btrfs_compress_op[idx]->alloc_workspace();
+ if (heuristic)
+ workspace = alloc_heuristic_ws();
+ else
+ workspace = btrfs_compress_op[idx]->alloc_workspace();
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -812,18 +945,38 @@ again:
return workspace;
}
+static struct list_head *find_workspace(int type)
+{
+ return __find_workspace(type, false);
+}
+
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-static void free_workspace(int type, struct list_head *workspace)
+static void __free_workspace(int type, struct list_head *workspace,
+ bool heuristic)
{
int idx = type - 1;
- struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
- spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
- atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
- wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
- int *free_ws = &btrfs_comp_ws[idx].free_ws;
+ struct list_head *idle_ws;
+ spinlock_t *ws_lock;
+ atomic_t *total_ws;
+ wait_queue_head_t *ws_wait;
+ int *free_ws;
+
+ if (heuristic) {
+ idle_ws = &btrfs_heuristic_ws.idle_ws;
+ ws_lock = &btrfs_heuristic_ws.ws_lock;
+ total_ws = &btrfs_heuristic_ws.total_ws;
+ ws_wait = &btrfs_heuristic_ws.ws_wait;
+ free_ws = &btrfs_heuristic_ws.free_ws;
+ } else {
+ idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ total_ws = &btrfs_comp_ws[idx].total_ws;
+ ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ free_ws = &btrfs_comp_ws[idx].free_ws;
+ }
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
@@ -834,7 +987,10 @@ static void free_workspace(int type, struct list_head *workspace)
}
spin_unlock(ws_lock);
- btrfs_compress_op[idx]->free_workspace(workspace);
+ if (heuristic)
+ free_heuristic_ws(workspace);
+ else
+ btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(total_ws);
wake:
/*
@@ -845,6 +1001,11 @@ wake:
wake_up(ws_wait);
}
+static void free_workspace(int type, struct list_head *ws)
+{
+ return __free_workspace(type, ws, false);
+}
+
/*
* cleanup function for module exit
*/
@@ -853,6 +1014,13 @@ static void free_workspaces(void)
struct list_head *workspace;
int i;
+ while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
+ workspace = btrfs_heuristic_ws.idle_ws.next;
+ list_del(workspace);
+ free_heuristic_ws(workspace);
+ atomic_dec(&btrfs_heuristic_ws.total_ws);
+ }
+
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
workspace = btrfs_comp_ws[i].idle_ws.next;
@@ -867,6 +1035,11 @@ static void free_workspaces(void)
* Given an address space and start and length, compress the bytes into @pages
* that are allocated on demand.
*
+ * @type_level is encoded algorithm and level, where level 0 means whatever
+ * default the algorithm chooses and is opaque here;
+ * - compression algo are 0-3
+ * - the level are bits 4-7
+ *
* @out_pages is an in/out parameter, holds maximum number of pages to allocate
* and returns number of actually allocated pages
*
@@ -881,7 +1054,7 @@ static void free_workspaces(void)
* @max_out tells us the max number of bytes that we're allowed to
* stuff into pages
*/
-int btrfs_compress_pages(int type, struct address_space *mapping,
+int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
unsigned long *out_pages,
unsigned long *total_in,
@@ -889,9 +1062,11 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
{
struct list_head *workspace;
int ret;
+ int type = type_level & 0xF;
workspace = find_workspace(type);
+ btrfs_compress_op[type - 1]->set_level(workspace, type_level);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, pages,
out_pages,
@@ -1050,6 +1225,211 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
}
/*
+ * Shannon Entropy calculation
+ *
+ * Pure byte distribution analysis fails to determine compressiability of data.
+ * Try calculating entropy to estimate the average minimum number of bits
+ * needed to encode the sampled data.
+ *
+ * For convenience, return the percentage of needed bits, instead of amount of
+ * bits directly.
+ *
+ * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
+ * and can be compressible with high probability
+ *
+ * @ENTROPY_LVL_HIGH - data are not compressible with high probability
+ *
+ * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
+ */
+#define ENTROPY_LVL_ACEPTABLE (65)
+#define ENTROPY_LVL_HIGH (80)
+
+/*
+ * For increasead precision in shannon_entropy calculation,
+ * let's do pow(n, M) to save more digits after comma:
+ *
+ * - maximum int bit length is 64
+ * - ilog2(MAX_SAMPLE_SIZE) -> 13
+ * - 13 * 4 = 52 < 64 -> M = 4
+ *
+ * So use pow(n, 4).
+ */
+static inline u32 ilog2_w(u64 n)
+{
+ return ilog2(n * n * n * n);
+}
+
+static u32 shannon_entropy(struct heuristic_ws *ws)
+{
+ const u32 entropy_max = 8 * ilog2_w(2);
+ u32 entropy_sum = 0;
+ u32 p, p_base, sz_base;
+ u32 i;
+
+ sz_base = ilog2_w(ws->sample_size);
+ for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
+ p = ws->bucket[i].count;
+ p_base = ilog2_w(p);
+ entropy_sum += p * (sz_base - p_base);
+ }
+
+ entropy_sum /= ws->sample_size;
+ return entropy_sum * 100 / entropy_max;
+}
+
+/* Compare buckets by size, ascending */
+static int bucket_comp_rev(const void *lv, const void *rv)
+{
+ const struct bucket_item *l = (const struct bucket_item *)lv;
+ const struct bucket_item *r = (const struct bucket_item *)rv;
+
+ return r->count - l->count;
+}
+
+/*
+ * Size of the core byte set - how many bytes cover 90% of the sample
+ *
+ * There are several types of structured binary data that use nearly all byte
+ * values. The distribution can be uniform and counts in all buckets will be
+ * nearly the same (eg. encrypted data). Unlikely to be compressible.
+ *
+ * Other possibility is normal (Gaussian) distribution, where the data could
+ * be potentially compressible, but we have to take a few more steps to decide
+ * how much.
+ *
+ * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently,
+ * compression algo can easy fix that
+ * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
+ * probability is not compressible
+ */
+#define BYTE_CORE_SET_LOW (64)
+#define BYTE_CORE_SET_HIGH (200)
+
+static int byte_core_set_size(struct heuristic_ws *ws)
+{
+ u32 i;
+ u32 coreset_sum = 0;
+ const u32 core_set_threshold = ws->sample_size * 90 / 100;
+ struct bucket_item *bucket = ws->bucket;
+
+ /* Sort in reverse order */
+ sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
+
+ for (i = 0; i < BYTE_CORE_SET_LOW; i++)
+ coreset_sum += bucket[i].count;
+
+ if (coreset_sum > core_set_threshold)
+ return i;
+
+ for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
+ coreset_sum += bucket[i].count;
+ if (coreset_sum > core_set_threshold)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Count byte values in buckets.
+ * This heuristic can detect textual data (configs, xml, json, html, etc).
+ * Because in most text-like data byte set is restricted to limited number of
+ * possible characters, and that restriction in most cases makes data easy to
+ * compress.
+ *
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
+ * less - compressible
+ * more - need additional analysis
+ */
+#define BYTE_SET_THRESHOLD (64)
+
+static u32 byte_set_size(const struct heuristic_ws *ws)
+{
+ u32 i;
+ u32 byte_set_size = 0;
+
+ for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
+ if (ws->bucket[i].count > 0)
+ byte_set_size++;
+ }
+
+ /*
+ * Continue collecting count of byte values in buckets. If the byte
+ * set size is bigger then the threshold, it's pointless to continue,
+ * the detection technique would fail for this type of data.
+ */
+ for (; i < BUCKET_SIZE; i++) {
+ if (ws->bucket[i].count > 0) {
+ byte_set_size++;
+ if (byte_set_size > BYTE_SET_THRESHOLD)
+ return byte_set_size;
+ }
+ }
+
+ return byte_set_size;
+}
+
+static bool sample_repeated_patterns(struct heuristic_ws *ws)
+{
+ const u32 half_of_sample = ws->sample_size / 2;
+ const u8 *data = ws->sample;
+
+ return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
+}
+
+static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
+ struct heuristic_ws *ws)
+{
+ struct page *page;
+ u64 index, index_end;
+ u32 i, curr_sample_pos;
+ u8 *in_data;
+
+ /*
+ * Compression handles the input data by chunks of 128KiB
+ * (defined by BTRFS_MAX_UNCOMPRESSED)
+ *
+ * We do the same for the heuristic and loop over the whole range.
+ *
+ * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
+ * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
+ */
+ if (end - start > BTRFS_MAX_UNCOMPRESSED)
+ end = start + BTRFS_MAX_UNCOMPRESSED;
+
+ index = start >> PAGE_SHIFT;
+ index_end = end >> PAGE_SHIFT;
+
+ /* Don't miss unaligned end */
+ if (!IS_ALIGNED(end, PAGE_SIZE))
+ index_end++;
+
+ curr_sample_pos = 0;
+ while (index < index_end) {
+ page = find_get_page(inode->i_mapping, index);
+ in_data = kmap(page);
+ /* Handle case where the start is not aligned to PAGE_SIZE */
+ i = start % PAGE_SIZE;
+ while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
+ /* Don't sample any garbage from the last page */
+ if (start > end - SAMPLING_READ_SIZE)
+ break;
+ memcpy(&ws->sample[curr_sample_pos], &in_data[i],
+ SAMPLING_READ_SIZE);
+ i += SAMPLING_INTERVAL;
+ start += SAMPLING_INTERVAL;
+ curr_sample_pos += SAMPLING_READ_SIZE;
+ }
+ kunmap(page);
+ put_page(page);
+
+ index++;
+ }
+
+ ws->sample_size = curr_sample_pos;
+}
+
+/*
* Compression heuristic.
*
* For now is's a naive and optimistic 'return true', we'll extend the logic to
@@ -1066,18 +1446,87 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
- u64 index = start >> PAGE_SHIFT;
- u64 end_index = end >> PAGE_SHIFT;
- struct page *page;
- int ret = 1;
+ struct list_head *ws_list = __find_workspace(0, true);
+ struct heuristic_ws *ws;
+ u32 i;
+ u8 byte;
+ int ret = 0;
- while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- kmap(page);
- kunmap(page);
- put_page(page);
- index++;
+ ws = list_entry(ws_list, struct heuristic_ws, list);
+
+ heuristic_collect_sample(inode, start, end, ws);
+
+ if (sample_repeated_patterns(ws)) {
+ ret = 1;
+ goto out;
+ }
+
+ memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
+
+ for (i = 0; i < ws->sample_size; i++) {
+ byte = ws->sample[i];
+ ws->bucket[byte].count++;
+ }
+
+ i = byte_set_size(ws);
+ if (i < BYTE_SET_THRESHOLD) {
+ ret = 2;
+ goto out;
+ }
+
+ i = byte_core_set_size(ws);
+ if (i <= BYTE_CORE_SET_LOW) {
+ ret = 3;
+ goto out;
+ }
+
+ if (i >= BYTE_CORE_SET_HIGH) {
+ ret = 0;
+ goto out;
+ }
+
+ i = shannon_entropy(ws);
+ if (i <= ENTROPY_LVL_ACEPTABLE) {
+ ret = 4;
+ goto out;
+ }
+
+ /*
+ * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
+ * needed to give green light to compression.
+ *
+ * For now just assume that compression at that level is not worth the
+ * resources because:
+ *
+ * 1. it is possible to defrag the data later
+ *
+ * 2. the data would turn out to be hardly compressible, eg. 150 byte
+ * values, every bucket has counter at level ~54. The heuristic would
+ * be confused. This can happen when data have some internal repeated
+ * patterns like "abbacbbc...". This can be detected by analyzing
+ * pairs of bytes, which is too costly.
+ */
+ if (i < ENTROPY_LVL_HIGH) {
+ ret = 5;
+ goto out;
+ } else {
+ ret = 0;
+ goto out;
}
+out:
+ __free_workspace(0, ws_list, true);
return ret;
}
+
+unsigned int btrfs_compress_str2level(const char *str)
+{
+ if (strncmp(str, "zlib", 4) != 0)
+ return 0;
+
+ /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
+ if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
+ return str[5] - '0';
+
+ return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d2781ff8f994..da20755ebf21 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -76,7 +76,7 @@ struct compressed_bio {
void btrfs_init_compress(void);
void btrfs_exit_compress(void);
-int btrfs_compress_pages(int type, struct address_space *mapping,
+int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
unsigned long *out_pages,
unsigned long *total_in,
@@ -95,6 +95,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
+unsigned btrfs_compress_str2level(const char *str);
+
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
@@ -124,6 +126,8 @@ struct btrfs_compress_op {
struct page *dest_page,
unsigned long start_byte,
size_t srclen, size_t destlen);
+
+ void (*set_level)(struct list_head *ws, unsigned int type);
};
extern const struct btrfs_compress_op btrfs_zlib_compress;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d49db7d86be..531e0a8645b0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
* tree until you end up with a lock on the root. A locked buffer
* is returned, with a reference held.
*/
-static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
@@ -5496,8 +5496,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
} else if (left_end_reached) {
if (right_level == 0) {
- ret = changed_cb(left_root, right_root,
- left_path, right_path,
+ ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
ctx);
@@ -5508,8 +5507,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
continue;
} else if (right_end_reached) {
if (left_level == 0) {
- ret = changed_cb(left_root, right_root,
- left_path, right_path,
+ ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
ctx);
@@ -5523,8 +5521,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
if (left_level == 0 && right_level == 0) {
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
- ret = changed_cb(left_root, right_root,
- left_path, right_path,
+ ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
ctx);
@@ -5532,8 +5529,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
advance_left = ADVANCE;
} else if (cmp > 0) {
- ret = changed_cb(left_root, right_root,
- left_path, right_path,
+ ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
ctx);
@@ -5550,8 +5546,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
result = BTRFS_COMPARE_TREE_CHANGED;
else
result = BTRFS_COMPARE_TREE_SAME;
- ret = changed_cb(left_root, right_root,
- left_path, right_path,
+ ret = changed_cb(left_path, right_path,
&left_key, result, ctx);
if (ret < 0)
goto out;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5a8933da39a7..f7df5536ab61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -523,7 +523,7 @@ struct btrfs_caching_control {
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
-#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
+#define CACHING_CTL_WAKE_UP SZ_2M
struct btrfs_io_ctl {
void *cur, *orig;
@@ -709,7 +709,6 @@ struct btrfs_delayed_root;
#define BTRFS_FS_OPEN 5
#define BTRFS_FS_QUOTA_ENABLED 6
#define BTRFS_FS_QUOTA_ENABLING 7
-#define BTRFS_FS_QUOTA_DISABLING 8
#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9
#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10
#define BTRFS_FS_BTREE_ERR 11
@@ -723,7 +722,7 @@ struct btrfs_delayed_root;
* Indicate that a whole-filesystem exclusive operation is running
* (device replace, resize, device add/delete, balance)
*/
-#define BTRFS_FS_EXCL_OP 14
+#define BTRFS_FS_EXCL_OP 16
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -764,8 +763,6 @@ struct btrfs_fs_info {
* delayed dir index item
*/
struct btrfs_block_rsv global_block_rsv;
- /* block reservation for delay allocation */
- struct btrfs_block_rsv delalloc_block_rsv;
/* block reservation for metadata operations */
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
@@ -791,6 +788,7 @@ struct btrfs_fs_info {
*/
unsigned long pending_changes;
unsigned long compress_type:4;
+ unsigned int compress_level;
int commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
@@ -879,9 +877,6 @@ struct btrfs_fs_info {
rwlock_t tree_mod_log_lock;
struct rb_root tree_mod_log;
- atomic_t nr_async_submits;
- atomic_t async_submit_draining;
- atomic_t nr_async_bios;
atomic_t async_delalloc_pages;
atomic_t open_ioctl_trans;
@@ -1101,6 +1096,11 @@ struct btrfs_fs_info {
u32 nodesize;
u32 sectorsize;
u32 stripesize;
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ spinlock_t ref_verify_lock;
+ struct rb_root block_tree;
+#endif
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -1339,6 +1339,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
+#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -2640,7 +2641,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- u64 root_objectid, u64 owner,
+ struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
@@ -2659,7 +2660,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 flags,
int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset);
@@ -2671,7 +2672,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset);
@@ -2745,6 +2746,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
u64 *qgroup_reserved, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode,
@@ -2752,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ unsigned short type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
@@ -2810,6 +2816,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2822,9 +2829,7 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- struct btrfs_path *left_path,
+typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
struct btrfs_path *right_path,
struct btrfs_key *key,
enum btrfs_compare_tree_result result,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 19e4ad2f3f2e..5d73f79ded8b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
- bool release = false;
src_rsv = trans->block_rsv;
dst_rsv = &fs_info->delayed_block_rsv;
@@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
/*
- * If our block_rsv is the delalloc block reserve then check and see if
- * we have our extra reservation for updating the inode. If not fall
- * through and try to reserve space quickly.
- *
- * We used to try and steal from the delalloc block rsv or the global
- * reserve, but we'd steal a full reservation, which isn't kind. We are
- * here through delalloc which means we've likely just cowed down close
- * to the leaf that contains the inode, so we would steal less just
- * doing the fallback inode update, so if we do end up having to steal
- * from the global block rsv we hopefully only steal one or two blocks
- * worth which is less likely to hurt us.
- */
- if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
- spin_lock(&inode->lock);
- if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &inode->runtime_flags))
- release = true;
- else
- src_rsv = NULL;
- spin_unlock(&inode->lock);
- }
-
- /*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
* still need to reserve space for this update, so try to reserve the
* space.
*
* Now if src_rsv == delalloc_block_rsv we'll let it just steal since
- * we're accounted for.
+ * we always reserve enough to update the inode item.
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
@@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata(
}
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-
- /*
- * Migrate only takes a reservation, it doesn't touch the size of the
- * block_rsv. This is to simplify people who don't normally have things
- * migrated from their block rsv. If they go to release their
- * reservation, that will decrease the size as well, so if migrate
- * reduced size we'd end up with a negative size. But for the
- * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
- * but we could in fact do this reserve/migrate dance several times
- * between the time we did the original reservation and we'd clean it
- * up. So to take care of this, release the space for the meta
- * reservation here. I think it may be time for a documentation page on
- * how block rsvs. work.
- */
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
node->bytes_reserved = num_bytes;
}
- if (release) {
- trace_btrfs_space_reservation(fs_info, "delalloc",
- btrfs_ino(inode), num_bytes, 0);
- btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
- }
-
return ret;
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 93ffa898df6d..83be8f9fd906 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -40,10 +40,10 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* compare two delayed tree backrefs with same bytenr and type
*/
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
- struct btrfs_delayed_tree_ref *ref1, int type)
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
+ struct btrfs_delayed_tree_ref *ref2)
{
- if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
if (ref1->root < ref2->root)
return -1;
if (ref1->root > ref2->root)
@@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
/*
* compare two delayed data backrefs with same bytenr and type
*/
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
- struct btrfs_delayed_data_ref *ref1)
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
+ struct btrfs_delayed_data_ref *ref2)
{
if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
if (ref1->root < ref2->root)
@@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
return 0;
}
+static int comp_refs(struct btrfs_delayed_ref_node *ref1,
+ struct btrfs_delayed_ref_node *ref2,
+ bool check_seq)
+{
+ int ret = 0;
+
+ if (ref1->type < ref2->type)
+ return -1;
+ if (ref1->type > ref2->type)
+ return 1;
+ if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
+ btrfs_delayed_node_to_tree_ref(ref2));
+ else
+ ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
+ btrfs_delayed_node_to_data_ref(ref2));
+ if (ret)
+ return ret;
+ if (check_seq) {
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
+ }
+ return 0;
+}
+
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
struct rb_node *node)
@@ -96,15 +124,43 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
u64 bytenr;
ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
- bytenr = ins->node.bytenr;
+ bytenr = ins->bytenr;
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
href_node);
- if (bytenr < entry->node.bytenr)
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+ struct btrfs_delayed_ref_node *ins)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *node = &ins->ref_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+
+ while (*p) {
+ int comp;
+
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ ref_node);
+ comp = comp_refs(ins, entry, true);
+ if (comp < 0)
p = &(*p)->rb_left;
- else if (bytenr > entry->node.bytenr)
+ else if (comp > 0)
p = &(*p)->rb_right;
else
return entry;
@@ -133,15 +189,15 @@ find_ref_head(struct rb_root *root, u64 bytenr,
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
- if (bytenr < entry->node.bytenr)
+ if (bytenr < entry->bytenr)
n = n->rb_left;
- else if (bytenr > entry->node.bytenr)
+ else if (bytenr > entry->bytenr)
n = n->rb_right;
else
return entry;
}
if (entry && return_bigger) {
- if (bytenr > entry->node.bytenr) {
+ if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
n = rb_first(root);
@@ -164,17 +220,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
if (mutex_trylock(&head->mutex))
return 0;
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
spin_lock(&delayed_refs->lock);
- if (!head->node.in_tree) {
+ if (RB_EMPTY_NODE(&head->href_node)) {
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
return -EAGAIN;
}
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
return 0;
}
@@ -183,15 +239,11 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
- if (btrfs_delayed_ref_is_head(ref)) {
- head = btrfs_delayed_node_to_head(ref);
- rb_erase(&head->href_node, &delayed_refs->href_root);
- } else {
- assert_spin_locked(&head->lock);
- list_del(&ref->list);
- if (!list_empty(&ref->add_list))
- list_del(&ref->add_list);
- }
+ assert_spin_locked(&head->lock);
+ rb_erase(&ref->ref_node, &head->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
@@ -206,36 +258,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
u64 seq)
{
struct btrfs_delayed_ref_node *next;
+ struct rb_node *node = rb_next(&ref->ref_node);
bool done = false;
- next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
- list);
- while (!done && &next->list != &head->ref_list) {
+ while (!done && node) {
int mod;
- struct btrfs_delayed_ref_node *next2;
-
- next2 = list_next_entry(next, list);
-
- if (next == ref)
- goto next;
+ next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+ node = rb_next(node);
if (seq && next->seq >= seq)
- goto next;
-
- if (next->type != ref->type)
- goto next;
-
- if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
- comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
- btrfs_delayed_node_to_tree_ref(next),
- ref->type))
- goto next;
- if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
- ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
- comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
- btrfs_delayed_node_to_data_ref(next)))
- goto next;
+ break;
+ if (comp_refs(ref, next, false))
+ break;
if (ref->action == next->action) {
mod = next->ref_mod;
@@ -259,8 +293,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
}
-next:
- next = next2;
}
return done;
@@ -272,11 +304,12 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
+ struct rb_node *node;
u64 seq = 0;
assert_spin_locked(&head->lock);
- if (list_empty(&head->ref_list))
+ if (RB_EMPTY_ROOT(&head->ref_tree))
return;
/* We don't have too many refs to merge for data. */
@@ -293,22 +326,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
}
spin_unlock(&fs_info->tree_mod_seq_lock);
- ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
- list);
- while (&ref->list != &head->ref_list) {
+again:
+ for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
- goto next;
-
- if (merge_ref(trans, delayed_refs, head, ref, seq)) {
- if (list_empty(&head->ref_list))
- break;
- ref = list_first_entry(&head->ref_list,
- struct btrfs_delayed_ref_node,
- list);
continue;
- }
-next:
- ref = list_next_entry(ref, list);
+ if (merge_ref(trans, delayed_refs, head, ref, seq))
+ goto again;
}
}
@@ -380,8 +404,8 @@ again:
head->processing = 1;
WARN_ON(delayed_refs->num_heads_ready == 0);
delayed_refs->num_heads_ready--;
- delayed_refs->run_delayed_start = head->node.bytenr +
- head->node.num_bytes;
+ delayed_refs->run_delayed_start = head->bytenr +
+ head->num_bytes;
return head;
}
@@ -391,37 +415,19 @@ again:
* Return 0 for insert.
* Return >0 for merge.
*/
-static int
-add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *root,
- struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *ref)
+static int insert_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
{
struct btrfs_delayed_ref_node *exist;
int mod;
int ret = 0;
spin_lock(&href->lock);
- /* Check whether we can merge the tail node with ref */
- if (list_empty(&href->ref_list))
- goto add_tail;
- exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
- list);
- /* No need to compare bytenr nor is_head */
- if (exist->type != ref->type || exist->seq != ref->seq)
- goto add_tail;
-
- if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
- exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
- comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
- btrfs_delayed_node_to_tree_ref(ref),
- ref->type))
- goto add_tail;
- if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
- exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
- comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
- btrfs_delayed_node_to_data_ref(ref)))
- goto add_tail;
+ exist = tree_insert(&href->ref_tree, ref);
+ if (!exist)
+ goto inserted;
/* Now we are sure we can merge */
ret = 1;
@@ -452,9 +458,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
drop_delayed_ref(trans, root, href, exist);
spin_unlock(&href->lock);
return ret;
-
-add_tail:
- list_add_tail(&ref->list, &href->ref_list);
+inserted:
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
@@ -469,20 +473,16 @@ add_tail:
*/
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update,
+ struct btrfs_delayed_ref_head *existing,
+ struct btrfs_delayed_ref_head *update,
int *old_ref_mod_ret)
{
- struct btrfs_delayed_ref_head *existing_ref;
- struct btrfs_delayed_ref_head *ref;
int old_ref_mod;
- existing_ref = btrfs_delayed_node_to_head(existing);
- ref = btrfs_delayed_node_to_head(update);
- BUG_ON(existing_ref->is_data != ref->is_data);
+ BUG_ON(existing->is_data != update->is_data);
- spin_lock(&existing_ref->lock);
- if (ref->must_insert_reserved) {
+ spin_lock(&existing->lock);
+ if (update->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
* entries were processed, we can end up
@@ -490,7 +490,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
* the must_insert_reserved flag set.
* Set it again here
*/
- existing_ref->must_insert_reserved = ref->must_insert_reserved;
+ existing->must_insert_reserved = update->must_insert_reserved;
/*
* update the num_bytes so we make sure the accounting
@@ -500,22 +500,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
}
- if (ref->extent_op) {
- if (!existing_ref->extent_op) {
- existing_ref->extent_op = ref->extent_op;
+ if (update->extent_op) {
+ if (!existing->extent_op) {
+ existing->extent_op = update->extent_op;
} else {
- if (ref->extent_op->update_key) {
- memcpy(&existing_ref->extent_op->key,
- &ref->extent_op->key,
- sizeof(ref->extent_op->key));
- existing_ref->extent_op->update_key = true;
+ if (update->extent_op->update_key) {
+ memcpy(&existing->extent_op->key,
+ &update->extent_op->key,
+ sizeof(update->extent_op->key));
+ existing->extent_op->update_key = true;
}
- if (ref->extent_op->update_flags) {
- existing_ref->extent_op->flags_to_set |=
- ref->extent_op->flags_to_set;
- existing_ref->extent_op->update_flags = true;
+ if (update->extent_op->update_flags) {
+ existing->extent_op->flags_to_set |=
+ update->extent_op->flags_to_set;
+ existing->extent_op->update_flags = true;
}
- btrfs_free_delayed_extent_op(ref->extent_op);
+ btrfs_free_delayed_extent_op(update->extent_op);
}
}
/*
@@ -523,23 +523,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
- old_ref_mod = existing_ref->total_ref_mod;
+ old_ref_mod = existing->total_ref_mod;
if (old_ref_mod_ret)
*old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
- existing_ref->total_ref_mod += update->ref_mod;
+ existing->total_ref_mod += update->ref_mod;
/*
* If we are going to from a positive ref mod to a negative or vice
* versa we need to make sure to adjust pending_csums accordingly.
*/
- if (existing_ref->is_data) {
- if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+ if (existing->is_data) {
+ if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
delayed_refs->pending_csums -= existing->num_bytes;
- if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
delayed_refs->pending_csums += existing->num_bytes;
}
- spin_unlock(&existing_ref->lock);
+ spin_unlock(&existing->lock);
}
/*
@@ -550,14 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
+ struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
int action, int is_data, int *qrecord_inserted_ret,
int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_ref_head *existing;
- struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -593,26 +592,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
- /* first set the basic ref node struct up */
- refcount_set(&ref->refs, 1);
- ref->bytenr = bytenr;
- ref->num_bytes = num_bytes;
- ref->ref_mod = count_mod;
- ref->type = 0;
- ref->action = 0;
- ref->is_head = 1;
- ref->in_tree = 1;
- ref->seq = 0;
-
- head_ref = btrfs_delayed_node_to_head(ref);
+ refcount_set(&head_ref->refs, 1);
+ head_ref->bytenr = bytenr;
+ head_ref->num_bytes = num_bytes;
+ head_ref->ref_mod = count_mod;
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
- INIT_LIST_HEAD(&head_ref->ref_list);
+ head_ref->ref_tree = RB_ROOT;
INIT_LIST_HEAD(&head_ref->ref_add_list);
+ RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
head_ref->qgroup_reserved = 0;
head_ref->qgroup_ref_root = 0;
+ spin_lock_init(&head_ref->lock);
+ mutex_init(&head_ref->mutex);
/* Record qgroup extent info if provided */
if (qrecord) {
@@ -632,17 +626,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord_inserted = 1;
}
- spin_lock_init(&head_ref->lock);
- mutex_init(&head_ref->mutex);
-
- trace_add_delayed_ref_head(fs_info, ref, head_ref, action);
+ trace_add_delayed_ref_head(fs_info, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
WARN_ON(ref_root && reserved && existing->qgroup_ref_root
&& existing->qgroup_reserved);
- update_existing_head_ref(delayed_refs, &existing->node, ref,
+ update_existing_head_ref(delayed_refs, existing, head_ref,
old_ref_mod);
/*
* we've updated the existing ref, free the newly
@@ -699,7 +690,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
- INIT_LIST_HEAD(&ref->list);
+ RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -713,7 +704,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
- ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
/*
* XXX: memory should be freed at the same level allocated.
@@ -756,7 +747,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
- INIT_LIST_HEAD(&ref->list);
+ RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -772,8 +763,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
- ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
-
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
if (ret > 0)
kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
}
@@ -821,7 +811,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
+ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
bytenr, num_bytes, 0, 0, action, 0,
&qrecord_inserted, old_ref_mod,
new_ref_mod);
@@ -888,7 +878,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
+ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
bytenr, num_bytes, ref_root, reserved,
action, 1, &qrecord_inserted,
old_ref_mod, new_ref_mod);
@@ -920,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+ add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data, NULL, NULL, NULL);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ce88e4ac5276..a43af432f859 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -26,18 +26,8 @@
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
-/*
- * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the
- * same ref_node structure.
- * Ref_head is in a higher logic level than tree/data ref, and duplicated
- * bytenr/num_bytes in ref_node is really a waste or memory, they should be
- * referred from ref_head.
- * This gets more disgusting after we use list to store tree/data ref in
- * ref_head. Must clean this mess up later.
- */
struct btrfs_delayed_ref_node {
- /*data/tree ref use list, stored in ref_head->ref_list. */
- struct list_head list;
+ struct rb_node ref_node;
/*
* If action is BTRFS_ADD_DELAYED_REF, also link this node to
* ref_head->ref_add_list, then we do not need to iterate the
@@ -91,8 +81,9 @@ struct btrfs_delayed_extent_op {
* reference count modifications we've queued up.
*/
struct btrfs_delayed_ref_head {
- struct btrfs_delayed_ref_node node;
-
+ u64 bytenr;
+ u64 num_bytes;
+ refcount_t refs;
/*
* the mutex is held while running the refs, and it is also
* held when checking the sum of reference modifications.
@@ -100,7 +91,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct list_head ref_list;
+ struct rb_root ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
@@ -116,6 +107,14 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
/*
+ * This is the current outstanding mod references for this bytenr. This
+ * is used with lookup_extent_info to get an accurate reference count
+ * for a bytenr, so it is adjusted as delayed refs are run so that any
+ * on disk reference count + ref_mod is accurate.
+ */
+ int ref_mod;
+
+ /*
* For qgroup reserved space freeing.
*
* ref_root and reserved will be recorded after
@@ -234,15 +233,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
case BTRFS_SHARED_DATA_REF_KEY:
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
break;
- case 0:
- kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
- break;
default:
BUG();
}
}
}
+static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
+{
+ if (refcount_dec_and_test(&head->refs))
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
+}
+
int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -283,35 +285,17 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
u64 seq);
/*
- * a node might live in a head or a regular ref, this lets you
- * test for the proper type to use.
- */
-static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
-{
- return node->is_head;
-}
-
-/*
* helper functions to cast a node into its container
*/
static inline struct btrfs_delayed_tree_ref *
btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
{
- WARN_ON(btrfs_delayed_ref_is_head(node));
return container_of(node, struct btrfs_delayed_tree_ref, node);
}
static inline struct btrfs_delayed_data_ref *
btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
{
- WARN_ON(btrfs_delayed_ref_is_head(node));
return container_of(node, struct btrfs_delayed_data_ref, node);
}
-
-static inline struct btrfs_delayed_ref_head *
-btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
-{
- WARN_ON(!btrfs_delayed_ref_is_head(node));
- return container_of(node, struct btrfs_delayed_ref_head, node);
-}
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 487bbe4fb3c6..efce9a2fa9be 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,8 @@
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
+#include "tree-checker.h"
+#include "ref-verify.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -543,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
return ret;
}
-#define CORRUPT(reason, eb, root, slot) \
- btrfs_crit(root->fs_info, \
- "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \
- btrfs_header_level(eb) == 0 ? "leaf" : "node", \
- reason, btrfs_header_bytenr(eb), root->objectid, slot)
-
-static noinline int check_leaf(struct btrfs_root *root,
- struct extent_buffer *leaf)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct btrfs_key leaf_key;
- u32 nritems = btrfs_header_nritems(leaf);
- int slot;
-
- /*
- * Extent buffers from a relocation tree have a owner field that
- * corresponds to the subvolume tree they are based on. So just from an
- * extent buffer alone we can not find out what is the id of the
- * corresponding subvolume tree, so we can not figure out if the extent
- * buffer corresponds to the root of the relocation tree or not. So skip
- * this check for relocation trees.
- */
- if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
- struct btrfs_root *check_root;
-
- key.objectid = btrfs_header_owner(leaf);
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- check_root = btrfs_get_fs_root(fs_info, &key, false);
- /*
- * The only reason we also check NULL here is that during
- * open_ctree() some roots has not yet been set up.
- */
- if (!IS_ERR_OR_NULL(check_root)) {
- struct extent_buffer *eb;
-
- eb = btrfs_root_node(check_root);
- /* if leaf is the root, then it's fine */
- if (leaf != eb) {
- CORRUPT("non-root leaf's nritems is 0",
- leaf, check_root, 0);
- free_extent_buffer(eb);
- return -EIO;
- }
- free_extent_buffer(eb);
- }
- return 0;
- }
-
- if (nritems == 0)
- return 0;
-
- /* Check the 0 item */
- if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
- CORRUPT("invalid item offset size pair", leaf, root, 0);
- return -EIO;
- }
-
- /*
- * Check to make sure each items keys are in the correct order and their
- * offsets make sense. We only have to loop through nritems-1 because
- * we check the current slot against the next slot, which verifies the
- * next slot's offset+size makes sense and that the current's slot
- * offset is correct.
- */
- for (slot = 0; slot < nritems - 1; slot++) {
- btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &key, slot + 1);
-
- /* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
- CORRUPT("bad key order", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Make sure the offset and ends are right, remember that the
- * item data starts at the end of the leaf and grows towards the
- * front.
- */
- if (btrfs_item_offset_nr(leaf, slot) !=
- btrfs_item_end_nr(leaf, slot + 1)) {
- CORRUPT("slot offset bad", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Check to make sure that we don't point outside of the leaf,
- * just in case all the items are consistent to each other, but
- * all point outside of the leaf.
- */
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
- CORRUPT("slot end outside of leaf", leaf, root, slot);
- return -EIO;
- }
- }
-
- return 0;
-}
-
-static int check_node(struct btrfs_root *root, struct extent_buffer *node)
-{
- unsigned long nr = btrfs_header_nritems(node);
- struct btrfs_key key, next_key;
- int slot;
- u64 bytenr;
- int ret = 0;
-
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
- btrfs_crit(root->fs_info,
- "corrupt node: block %llu root %llu nritems %lu",
- node->start, root->objectid, nr);
- return -EIO;
- }
-
- for (slot = 0; slot < nr - 1; slot++) {
- bytenr = btrfs_node_blockptr(node, slot);
- btrfs_node_key_to_cpu(node, &key, slot);
- btrfs_node_key_to_cpu(node, &next_key, slot + 1);
-
- if (!bytenr) {
- CORRUPT("invalid item slot", node, root, slot);
- ret = -EIO;
- goto out;
- }
-
- if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
- CORRUPT("bad key order", node, root, slot);
- ret = -EIO;
- goto out;
- }
- }
-out:
- return ret;
-}
-
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
@@ -748,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && check_leaf(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf(root, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
- if (found_level > 0 && check_node(root, eb))
+ if (found_level > 0 && btrfs_check_node(root, eb))
ret = -EIO;
if (!ret)
@@ -879,22 +741,9 @@ static void run_one_async_start(struct btrfs_work *work)
static void run_one_async_done(struct btrfs_work *work)
{
- struct btrfs_fs_info *fs_info;
struct async_submit_bio *async;
- int limit;
async = container_of(work, struct async_submit_bio, work);
- fs_info = async->fs_info;
-
- limit = btrfs_async_submit_limit(fs_info);
- limit = limit * 2 / 3;
-
- /*
- * atomic_dec_return implies a barrier for waitqueue_active
- */
- if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
- waitqueue_active(&fs_info->async_submit_wait))
- wake_up(&fs_info->async_submit_wait);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
@@ -942,19 +791,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
async->status = 0;
- atomic_inc(&fs_info->nr_async_submits);
-
if (op_is_sync(bio->bi_opf))
btrfs_set_work_high_priority(&async->work);
btrfs_queue_work(fs_info->workers, &async->work);
-
- while (atomic_read(&fs_info->async_submit_draining) &&
- atomic_read(&fs_info->nr_async_submits)) {
- wait_event(fs_info->async_submit_wait,
- (atomic_read(&fs_info->nr_async_submits) == 0));
- }
-
return 0;
}
@@ -1005,9 +845,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
return ret;
}
-static int check_async_write(unsigned long bio_flags)
+static int check_async_write(struct btrfs_inode *bi)
{
- if (bio_flags & EXTENT_BIO_TREE_LOG)
+ if (atomic_read(&bi->sync_writers))
return 0;
#ifdef CONFIG_X86
if (static_cpu_has(X86_FEATURE_XMM4_2))
@@ -1022,7 +862,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
{
struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int async = check_async_write(bio_flags);
+ int async = check_async_write(BTRFS_I(inode));
blk_status_t ret;
if (bio_op(bio) != REQ_OP_WRITE) {
@@ -2607,14 +2447,6 @@ int open_ctree(struct super_block *sb,
goto fail_delalloc_bytes;
}
- fs_info->btree_inode = new_inode(sb);
- if (!fs_info->btree_inode) {
- err = -ENOMEM;
- goto fail_bio_counter;
- }
-
- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2647,17 +2479,12 @@ int open_ctree(struct super_block *sb,
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
- btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
- BTRFS_BLOCK_RSV_DELALLOC);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
- atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
- atomic_set(&fs_info->async_submit_draining, 0);
- atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
@@ -2673,12 +2500,21 @@ int open_ctree(struct super_block *sb,
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
+ btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
INIT_LIST_HEAD(&fs_info->ordered_roots);
spin_lock_init(&fs_info->ordered_root_lock);
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail_bio_counter;
+ }
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
GFP_KERNEL);
if (!fs_info->delayed_root) {
@@ -2895,12 +2731,13 @@ int open_ctree(struct super_block *sb,
sb->s_bdi->congested_fn = btrfs_congested_fn;
sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
+ memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(fs_info);
@@ -3083,6 +2920,9 @@ retry_root_backup:
if (ret)
goto fail_trans_kthread;
+ if (btrfs_build_ref_tree(fs_info))
+ btrfs_err(fs_info, "couldn't build ref tree");
+
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
@@ -3643,7 +3483,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
u64 flags;
do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
- backup_super_roots(fs_info);
+
+ /*
+ * max_mirrors == 0 indicates we're from commit_transaction,
+ * not from fsync where the tree roots in fs_info have not
+ * been consistent on disk.
+ */
+ if (max_mirrors == 0)
+ backup_super_roots(fs_info);
sb = fs_info->super_for_commit;
dev_item = &sb->dev_item;
@@ -3941,6 +3788,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
+ btrfs_free_ref_cache(fs_info);
__btrfs_free_block_rsv(root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
@@ -4000,7 +3848,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->len,
fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -4265,26 +4113,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
- struct btrfs_delayed_ref_node *tmp;
+ struct rb_node *n;
bool pin_bytes = false;
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
spin_lock(&delayed_refs->lock);
continue;
}
spin_lock(&head->lock);
- list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
- list) {
+ while ((n = rb_first(&head->ref_tree)) != NULL) {
+ ref = rb_entry(n, struct btrfs_delayed_ref_node,
+ ref_node);
ref->in_tree = 0;
- list_del(&ref->list);
+ rb_erase(&ref->ref_node, &head->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
@@ -4297,16 +4147,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->processing == 0)
delayed_refs->num_heads_ready--;
atomic_dec(&delayed_refs->num_entries);
- head->node.in_tree = 0;
rb_erase(&head->href_node, &delayed_refs->href_root);
+ RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
if (pin_bytes)
- btrfs_pin_extent(fs_info, head->node.bytenr,
- head->node.num_bytes, 1);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_pin_extent(fs_info, head->bytenr,
+ head->num_bytes, 1);
+ btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index fa66980726c9..3aeb5770f896 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/types.h>
#include "ctree.h"
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 074348a95841..91b3908e7c54 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXPORT_H
#define BTRFS_EXPORT_H
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2d7e86b51d1..673ac4e01dd0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
@@ -38,6 +39,7 @@
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
+#include "ref-verify.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -61,9 +63,6 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node, u64 parent,
@@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc);
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes);
@@ -652,7 +642,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_FAST;
spin_unlock(&cache->lock);
- if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
mutex_lock(&caching_ctl->mutex);
ret = load_free_space_cache(fs_info, cache);
@@ -923,7 +913,7 @@ search_again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -934,7 +924,7 @@ search_again:
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
goto search_again;
}
spin_lock(&head->lock);
@@ -943,7 +933,7 @@ search_again:
else
BUG_ON(num_refs == 0);
- num_refs += head->node.ref_mod;
+ num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
@@ -2189,16 +2179,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int old_ref_mod, new_ref_mod;
int ret;
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
+ btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
+ owner, offset, BTRFS_ADD_DELAYED_REF);
+
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes, parent,
@@ -2344,7 +2338,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_key key;
@@ -2366,14 +2360,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- key.objectid = node->bytenr;
+ key.objectid = head->bytenr;
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = extent_op->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = node->num_bytes;
+ key.offset = head->num_bytes;
}
again:
@@ -2390,17 +2384,17 @@ again:
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
- if (key.objectid == node->bytenr &&
+ if (key.objectid == head->bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == node->num_bytes)
+ key.offset == head->num_bytes)
ret = 0;
}
if (ret > 0) {
btrfs_release_path(path);
metadata = 0;
- key.objectid = node->bytenr;
- key.offset = node->num_bytes;
+ key.objectid = head->bytenr;
+ key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
goto again;
}
@@ -2507,44 +2501,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return 0;
}
- if (btrfs_delayed_ref_is_head(node)) {
- struct btrfs_delayed_ref_head *head;
- /*
- * we've hit the end of the chain and we were supposed
- * to insert this extent into the tree. But, it got
- * deleted before we ever needed to insert it, so all
- * we have to do is clean up the accounting
- */
- BUG_ON(extent_op);
- head = btrfs_delayed_node_to_head(node);
- trace_run_delayed_ref_head(fs_info, node, head, node->action);
-
- if (head->total_ref_mod < 0) {
- struct btrfs_block_group_cache *cache;
-
- cache = btrfs_lookup_block_group(fs_info, node->bytenr);
- ASSERT(cache);
- percpu_counter_add(&cache->space_info->total_bytes_pinned,
- -node->num_bytes);
- btrfs_put_block_group(cache);
- }
-
- if (insert_reserved) {
- btrfs_pin_extent(fs_info, node->bytenr,
- node->num_bytes, 1);
- if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info,
- node->bytenr,
- node->num_bytes);
- }
- }
-
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
- return ret;
- }
-
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
@@ -2563,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
- if (list_empty(&head->ref_list))
+ if (RB_EMPTY_ROOT(&head->ref_tree))
return NULL;
/*
@@ -2576,12 +2532,114 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
- ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
- list);
+ ref = rb_entry(rb_first(&head->ref_tree),
+ struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
}
+static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ spin_lock(&delayed_refs->lock);
+ head->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(head);
+}
+
+static int cleanup_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ int ret;
+
+ if (!extent_op)
+ return 0;
+ head->extent_op = NULL;
+ if (head->must_insert_reserved) {
+ btrfs_free_delayed_extent_op(extent_op);
+ return 0;
+ }
+ spin_unlock(&head->lock);
+ ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
+ return ret ? ret : 1;
+}
+
+static int cleanup_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ ret = cleanup_extent_op(trans, fs_info, head);
+ if (ret < 0) {
+ unselect_delayed_ref_head(delayed_refs, head);
+ btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+ return ret;
+ } else if (ret) {
+ return ret;
+ }
+
+ /*
+ * Need to drop our head ref lock and re-acquire the delayed ref lock
+ * and then re-check to make sure nobody got added.
+ */
+ spin_unlock(&head->lock);
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&head->lock);
+ if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ return 1;
+ }
+ delayed_refs->num_heads--;
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ RB_CLEAR_NODE(&head->href_node);
+ spin_unlock(&delayed_refs->lock);
+ spin_unlock(&head->lock);
+ atomic_dec(&delayed_refs->num_entries);
+
+ trace_run_delayed_ref_head(fs_info, head, 0);
+
+ if (head->total_ref_mod < 0) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+ ASSERT(cache);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ -head->num_bytes);
+ btrfs_put_block_group(cache);
+
+ if (head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ }
+ }
+
+ if (head->must_insert_reserved) {
+ btrfs_pin_extent(fs_info, head->bytenr,
+ head->num_bytes, 1);
+ if (head->is_data) {
+ ret = btrfs_del_csums(trans, fs_info, head->bytenr,
+ head->num_bytes);
+ }
+ }
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
+ head->qgroup_reserved);
+ btrfs_delayed_ref_unlock(head);
+ btrfs_put_delayed_ref_head(head);
+ return 0;
+}
+
/*
* Returns 0 on success or if called with an already aborted transaction.
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
@@ -2655,11 +2713,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (ref && ref->seq &&
btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
spin_unlock(&locked_ref->lock);
- spin_lock(&delayed_refs->lock);
- locked_ref->processing = 0;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(locked_ref);
+ unselect_delayed_ref_head(delayed_refs, locked_ref);
locked_ref = NULL;
cond_resched();
count++;
@@ -2667,102 +2721,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
/*
- * record the must insert reserved flag before we
- * drop the spin lock.
+ * We're done processing refs in this ref_head, clean everything
+ * up and move on to the next ref_head.
*/
- must_insert_reserved = locked_ref->must_insert_reserved;
- locked_ref->must_insert_reserved = 0;
-
- extent_op = locked_ref->extent_op;
- locked_ref->extent_op = NULL;
-
if (!ref) {
-
-
- /* All delayed refs have been processed, Go ahead
- * and send the head node to run_one_delayed_ref,
- * so that any accounting fixes can happen
- */
- ref = &locked_ref->node;
-
- if (extent_op && must_insert_reserved) {
- btrfs_free_delayed_extent_op(extent_op);
- extent_op = NULL;
- }
-
- if (extent_op) {
- spin_unlock(&locked_ref->lock);
- ret = run_delayed_extent_op(trans, fs_info,
- ref, extent_op);
- btrfs_free_delayed_extent_op(extent_op);
-
- if (ret) {
- /*
- * Need to reset must_insert_reserved if
- * there was an error so the abort stuff
- * can cleanup the reserved space
- * properly.
- */
- if (must_insert_reserved)
- locked_ref->must_insert_reserved = 1;
- spin_lock(&delayed_refs->lock);
- locked_ref->processing = 0;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_debug(fs_info,
- "run_delayed_extent_op returned %d",
- ret);
- btrfs_delayed_ref_unlock(locked_ref);
- return ret;
- }
+ ret = cleanup_ref_head(trans, fs_info, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
continue;
+ } else if (ret) {
+ return ret;
}
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
- /*
- * Need to drop our head ref lock and re-acquire the
- * delayed ref lock and then re-check to make sure
- * nobody got added.
- */
- spin_unlock(&locked_ref->lock);
- spin_lock(&delayed_refs->lock);
- spin_lock(&locked_ref->lock);
- if (!list_empty(&locked_ref->ref_list) ||
- locked_ref->extent_op) {
- spin_unlock(&locked_ref->lock);
- spin_unlock(&delayed_refs->lock);
- continue;
- }
- ref->in_tree = 0;
- delayed_refs->num_heads--;
- rb_erase(&locked_ref->href_node,
- &delayed_refs->href_root);
- spin_unlock(&delayed_refs->lock);
- } else {
- actual_count++;
- ref->in_tree = 0;
- list_del(&ref->list);
- if (!list_empty(&ref->add_list))
- list_del(&ref->add_list);
+ actual_count++;
+ ref->in_tree = 0;
+ rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
+ /*
+ * When we play the delayed ref, also correct the ref_mod on
+ * head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
}
atomic_dec(&delayed_refs->num_entries);
- if (!btrfs_delayed_ref_is_head(ref)) {
- /*
- * when we play the delayed ref, also correct the
- * ref_mod on head
- */
- switch (ref->action) {
- case BTRFS_ADD_DELAYED_REF:
- case BTRFS_ADD_DELAYED_EXTENT:
- locked_ref->node.ref_mod -= ref->ref_mod;
- break;
- case BTRFS_DROP_DELAYED_REF:
- locked_ref->node.ref_mod += ref->ref_mod;
- break;
- default:
- WARN_ON(1);
- }
- }
+ /*
+ * Record the must-insert_reserved flag before we drop the spin
+ * lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ extent_op = locked_ref->extent_op;
+ locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
@@ -2770,33 +2777,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- spin_lock(&delayed_refs->lock);
- locked_ref->processing = 0;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(locked_ref);
+ unselect_delayed_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
ret);
return ret;
}
- /*
- * If this node is a head, that means all the refs in this head
- * have been dealt with, and we will pick the next head to deal
- * with, so we must unlock the head and drop it from the cluster
- * list before we release it.
- */
- if (btrfs_delayed_ref_is_head(ref)) {
- if (locked_ref->is_data &&
- locked_ref->total_ref_mod < 0) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= ref->num_bytes;
- spin_unlock(&delayed_refs->lock);
- }
- btrfs_delayed_ref_unlock(locked_ref);
- locked_ref = NULL;
- }
btrfs_put_delayed_ref(ref);
count++;
cond_resched();
@@ -3100,33 +3087,16 @@ again:
spin_unlock(&delayed_refs->lock);
goto out;
}
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
- while (node) {
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- if (btrfs_delayed_ref_is_head(&head->node)) {
- struct btrfs_delayed_ref_node *ref;
-
- ref = &head->node;
- refcount_inc(&ref->refs);
-
- spin_unlock(&delayed_refs->lock);
- /*
- * Mutex was contended, block until it's
- * released and try again
- */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
+ /* Mutex was contended, block until it's released and retry. */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- cond_resched();
- goto again;
- } else {
- WARN_ON(1);
- }
- node = rb_next(node);
- }
- spin_unlock(&delayed_refs->lock);
+ btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
@@ -3169,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_transaction *cur_trans;
+ struct rb_node *node;
int ret = 0;
cur_trans = root->fs_info->running_transaction;
@@ -3184,7 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -3195,13 +3166,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
return -EAGAIN;
}
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
- list_for_each_entry(ref, &head->ref_list, list) {
+ /*
+ * XXX: We should replace this with a proper search function in the
+ * future.
+ */
+ for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
@@ -3351,7 +3327,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *,
- struct btrfs_fs_info *,
+ struct btrfs_root *,
u64, u64, u64, u64, u64, u64);
@@ -3391,7 +3367,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
- ret = process_func(trans, fs_info, bytenr, num_bytes,
+ ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
key.offset);
if (ret)
@@ -3399,7 +3375,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
- ret = process_func(trans, fs_info, bytenr, num_bytes,
+ ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0);
if (ret)
goto fail;
@@ -4843,7 +4819,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
u64 orig, bool wait_ordered)
{
- struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
@@ -4859,8 +4834,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
to_reclaim = items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
- block_rsv = &fs_info->delalloc_block_rsv;
- space_info = block_rsv->space_info;
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -4919,6 +4893,13 @@ skip_async:
}
}
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ struct list_head list;
+ wait_queue_head_t wait;
+};
+
/**
* maybe_commit_transaction - possibly commit the transaction if its ok to
* @root - the root we're allocating for
@@ -4930,18 +4911,29 @@ skip_async:
* will return -ENOSPC.
*/
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 bytes, int force)
+ struct btrfs_space_info *space_info)
{
+ struct reserve_ticket *ticket = NULL;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
struct btrfs_trans_handle *trans;
+ u64 bytes;
trans = (struct btrfs_trans_handle *)current->journal_info;
if (trans)
return -EAGAIN;
- if (force)
- goto commit;
+ spin_lock(&space_info->lock);
+ if (!list_empty(&space_info->priority_tickets))
+ ticket = list_first_entry(&space_info->priority_tickets,
+ struct reserve_ticket, list);
+ else if (!list_empty(&space_info->tickets))
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ bytes = (ticket) ? ticket->bytes : 0;
+ spin_unlock(&space_info->lock);
+
+ if (!bytes)
+ return 0;
/* See if there is enough pinned space to make this reservation */
if (percpu_counter_compare(&space_info->total_bytes_pinned,
@@ -4956,8 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
return -ENOSPC;
spin_lock(&delayed_rsv->lock);
+ if (delayed_rsv->size > bytes)
+ bytes = 0;
+ else
+ bytes -= delayed_rsv->size;
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes - delayed_rsv->size) < 0) {
+ bytes) < 0) {
spin_unlock(&delayed_rsv->lock);
return -ENOSPC;
}
@@ -4971,13 +4967,6 @@ commit:
return btrfs_commit_transaction(trans);
}
-struct reserve_ticket {
- u64 bytes;
- int error;
- struct list_head list;
- wait_queue_head_t wait;
-};
-
/*
* Try to flush some data based on policy set by @state. This is only advisory
* and may fail for various reasons. The caller is supposed to examine the
@@ -5027,8 +5016,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = 0;
break;
case COMMIT_TRANS:
- ret = may_commit_transaction(fs_info, space_info,
- num_bytes, 0);
+ ret = may_commit_transaction(fs_info, space_info);
break;
default:
ret = -ENOSPC;
@@ -5582,11 +5570,12 @@ again:
}
}
-static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 ret;
spin_lock(&block_rsv->lock);
if (num_bytes == (u64)-1)
@@ -5601,6 +5590,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
}
spin_unlock(&block_rsv->lock);
+ ret = num_bytes;
if (num_bytes > 0) {
if (dest) {
spin_lock(&dest->lock);
@@ -5620,6 +5610,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
space_info_add_old_bytes(fs_info, space_info,
num_bytes);
}
+ return ret;
}
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
@@ -5643,6 +5634,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
rsv->type = type;
}
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ unsigned short type)
+{
+ btrfs_init_block_rsv(rsv, type);
+ rsv->space_info = __find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+}
+
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type)
{
@@ -5652,9 +5652,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
if (!block_rsv)
return NULL;
- btrfs_init_block_rsv(block_rsv, type);
- block_rsv->space_info = __find_space_info(fs_info,
- BTRFS_BLOCK_GROUP_METADATA);
+ btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
return block_rsv;
}
@@ -5737,6 +5735,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+/**
+ * btrfs_inode_rsv_refill - refill the inode block rsv.
+ * @inode - the inode we are refilling.
+ * @flush - the flusing restriction.
+ *
+ * Essentially the same as btrfs_block_rsv_refill, except it uses the
+ * block_rsv->size as the minimum size. We'll either refill the missing amount
+ * or return if we already have enough space. This will also handle the resreve
+ * tracepoint for the reserved amount.
+ */
+int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size)
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (num_bytes == 0)
+ return 0;
+
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 1);
+ }
+ return ret;
+}
+
+/**
+ * btrfs_inode_rsv_release - release any excessive reservation.
+ * @inode - the inode we need to release from.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 released = 0;
+
+ /*
+ * Since we statically set the block_rsv->size we just want to say we
+ * are releasing 0 bytes, and then we'll just get the reservation over
+ * the size free'd.
+ */
+ released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delalloc",
+ btrfs_ino(inode), released, 0);
+}
+
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
@@ -5808,7 +5866,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
- fs_info->delalloc_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
fs_info->delayed_block_rsv.space_info = space_info;
@@ -5828,8 +5885,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
(u64)-1);
- WARN_ON(fs_info->delalloc_block_rsv.size > 0);
- WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5841,12 +5896,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
- if (!trans->block_rsv)
+ if (!trans->block_rsv) {
+ ASSERT(!trans->bytes_reserved);
return;
+ }
if (!trans->bytes_reserved)
return;
+ ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
@@ -5968,104 +6026,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
}
-/**
- * drop_outstanding_extent - drop an outstanding extent
- * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're releasing.
- *
- * This is called when we are freeing up an outstanding extent, either called
- * after an error or after an extent is written. This will return the number of
- * reserved extents that need to be freed. This must be called with
- * BTRFS_I(inode)->lock held.
- */
-static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
- u64 num_bytes)
-{
- unsigned drop_inode_space = 0;
- unsigned dropped_extents = 0;
- unsigned num_extents;
-
- num_extents = count_max_extents(num_bytes);
- ASSERT(num_extents);
- ASSERT(inode->outstanding_extents >= num_extents);
- inode->outstanding_extents -= num_extents;
-
- if (inode->outstanding_extents == 0 &&
- test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &inode->runtime_flags))
- drop_inode_space = 1;
-
- /*
- * If we have more or the same amount of outstanding extents than we have
- * reserved then we need to leave the reserved extents count alone.
- */
- if (inode->outstanding_extents >= inode->reserved_extents)
- return drop_inode_space;
-
- dropped_extents = inode->reserved_extents - inode->outstanding_extents;
- inode->reserved_extents -= dropped_extents;
- return dropped_extents + drop_inode_space;
-}
-
-/**
- * calc_csum_metadata_size - return the amount of metadata space that must be
- * reserved/freed for the given bytes.
- * @inode: the inode we're manipulating
- * @num_bytes: the number of bytes in question
- * @reserve: 1 if we are reserving space, 0 if we are freeing space
- *
- * This adjusts the number of csum_bytes in the inode and then returns the
- * correct amount of metadata that must either be reserved or freed. We
- * calculate how many checksums we can fit into one leaf and then divide the
- * number of bytes that will need to be checksumed by this value to figure out
- * how many checksums will be required. If we are adding bytes then the number
- * may go up and we will return the number of additional bytes that must be
- * reserved. If it is going down we will return the number of bytes that must
- * be freed.
- *
- * This must be called with BTRFS_I(inode)->lock held.
- */
-static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
- int reserve)
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- u64 old_csums, num_csums;
-
- if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
- return 0;
-
- old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
- if (reserve)
- inode->csum_bytes += num_bytes;
- else
- inode->csum_bytes -= num_bytes;
- num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
-
- /* No change, no need to reserve more */
- if (old_csums == num_csums)
- return 0;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 reserve_size = 0;
+ u64 csum_leaves;
+ unsigned outstanding_extents;
- if (reserve)
- return btrfs_calc_trans_metadata_size(fs_info,
- num_csums - old_csums);
+ lockdep_assert_held(&inode->lock);
+ outstanding_extents = inode->outstanding_extents;
+ if (outstanding_extents)
+ reserve_size = btrfs_calc_trans_metadata_size(fs_info,
+ outstanding_extents + 1);
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+ inode->csum_bytes);
+ reserve_size += btrfs_calc_trans_metadata_size(fs_info,
+ csum_leaves);
- return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
+ spin_lock(&block_rsv->lock);
+ block_rsv->size = reserve_size;
+ spin_unlock(&block_rsv->lock);
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
struct btrfs_root *root = inode->root;
- struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
- u64 to_reserve = 0;
- u64 csum_bytes;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
- u64 to_free = 0;
- unsigned dropped;
- bool release_extra = false;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
@@ -6091,19 +6082,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ /* Add our new extents and calculate the new rsv size. */
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
- inode->outstanding_extents += nr_extents;
-
- nr_extents = 0;
- if (inode->outstanding_extents > inode->reserved_extents)
- nr_extents += inode->outstanding_extents -
- inode->reserved_extents;
-
- /* We always want to reserve a slot for updating the inode. */
- to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
- to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
- csum_bytes = inode->csum_bytes;
+ btrfs_mod_outstanding_extents(inode, nr_extents);
+ inode->csum_bytes += num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
@@ -6113,92 +6097,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
goto out_fail;
}
- ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
+ ret = btrfs_inode_rsv_refill(inode, flush);
if (unlikely(ret)) {
btrfs_qgroup_free_meta(root,
nr_extents * fs_info->nodesize);
goto out_fail;
}
- spin_lock(&inode->lock);
- if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &inode->runtime_flags)) {
- to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
- release_extra = true;
- }
- inode->reserved_extents += nr_extents;
- spin_unlock(&inode->lock);
-
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
-
- if (to_reserve)
- trace_btrfs_space_reservation(fs_info, "delalloc",
- btrfs_ino(inode), to_reserve, 1);
- if (release_extra)
- btrfs_block_rsv_release(fs_info, block_rsv,
- btrfs_calc_trans_metadata_size(fs_info, 1));
return 0;
out_fail:
spin_lock(&inode->lock);
- dropped = drop_outstanding_extent(inode, num_bytes);
- /*
- * If the inodes csum_bytes is the same as the original
- * csum_bytes then we know we haven't raced with any free()ers
- * so we can just reduce our inodes csum bytes and carry on.
- */
- if (inode->csum_bytes == csum_bytes) {
- calc_csum_metadata_size(inode, num_bytes, 0);
- } else {
- u64 orig_csum_bytes = inode->csum_bytes;
- u64 bytes;
-
- /*
- * This is tricky, but first we need to figure out how much we
- * freed from any free-ers that occurred during this
- * reservation, so we reset ->csum_bytes to the csum_bytes
- * before we dropped our lock, and then call the free for the
- * number of bytes that were freed while we were trying our
- * reservation.
- */
- bytes = csum_bytes - inode->csum_bytes;
- inode->csum_bytes = csum_bytes;
- to_free = calc_csum_metadata_size(inode, bytes, 0);
-
-
- /*
- * Now we need to see how much we would have freed had we not
- * been making this reservation and our ->csum_bytes were not
- * artificially inflated.
- */
- inode->csum_bytes = csum_bytes - num_bytes;
- bytes = csum_bytes - orig_csum_bytes;
- bytes = calc_csum_metadata_size(inode, bytes, 0);
-
- /*
- * Now reset ->csum_bytes to what it should be. If bytes is
- * more than to_free then we would have freed more space had we
- * not had an artificially high ->csum_bytes, so we need to free
- * the remainder. If bytes is the same or less then we don't
- * need to do anything, the other free-ers did the correct
- * thing.
- */
- inode->csum_bytes = orig_csum_bytes - num_bytes;
- if (bytes > to_free)
- to_free = bytes - to_free;
- else
- to_free = 0;
- }
+ nr_extents = count_max_extents(num_bytes);
+ btrfs_mod_outstanding_extents(inode, -nr_extents);
+ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (dropped)
- to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
- if (to_free) {
- btrfs_block_rsv_release(fs_info, block_rsv, to_free);
- trace_btrfs_space_reservation(fs_info, "delalloc",
- btrfs_ino(inode), to_free, 0);
- }
+ btrfs_inode_rsv_release(inode);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -6206,36 +6124,55 @@ out_fail:
/**
* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
- * @inode: the inode to release the reservation for
- * @num_bytes: the number of bytes we're releasing
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
- * reservations.
+ * reservations, or on error for the same reason.
*/
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- u64 to_free = 0;
- unsigned dropped;
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- dropped = drop_outstanding_extent(inode, num_bytes);
-
- if (num_bytes)
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (dropped > 0)
- to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
if (btrfs_is_testing(fs_info))
return;
- trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
- to_free, 0);
+ btrfs_inode_rsv_release(inode);
+}
+
+/**
+ * btrfs_delalloc_release_extents - release our outstanding_extents
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
+ *
+ * When we reserve space we increase outstanding_extents for the extents we may
+ * add. Once we've set the range as delalloc or created our ordered extents we
+ * have outstanding_extents to track the real usage, so we use this to free our
+ * temporarily tracked outstanding_extents. This _must_ be used in conjunction
+ * with btrfs_delalloc_reserve_metadata.
+ */
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ unsigned num_extents;
+
+ spin_lock(&inode->lock);
+ num_extents = count_max_extents(num_bytes);
+ btrfs_mod_outstanding_extents(inode, -num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
- btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
+ btrfs_inode_rsv_release(inode);
}
/**
@@ -6282,10 +6219,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* @inode: inode we're releasing space for
* @start: start position of the space already reserved
* @len: the len of the space already reserved
- *
- * This must be matched with a call to btrfs_delalloc_reserve_space. This is
- * called in the case that we don't need the metadata AND data reservations
- * anymore. So if there is an error or we insert an inline extent.
+ * @release_bytes: the len of the space we consumed or didn't use
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
@@ -6293,7 +6227,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* Also it will handle the qgroup reserved space.
*/
void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len)
{
btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
btrfs_free_reserved_data_space(inode, reserved, start, len);
@@ -6958,7 +6893,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (is_data)
- skinny_metadata = 0;
+ skinny_metadata = false;
ret = lookup_extent_backref(trans, info, path, &iref,
bytenr, num_bytes, parent,
@@ -7213,7 +7148,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (!list_empty(&head->ref_list))
+ if (!RB_EMPTY_ROOT(&head->ref_tree))
goto out;
if (head->extent_op) {
@@ -7234,9 +7169,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* at this point we have a head with no other entries. Go
* ahead and process it.
*/
- head->node.in_tree = 0;
rb_erase(&head->href_node, &delayed_refs->href_root);
-
+ RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
/*
@@ -7255,7 +7189,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
ret = 1;
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
return ret;
out:
spin_unlock(&head->lock);
@@ -7277,6 +7211,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
int old_ref_mod, new_ref_mod;
+ btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
+ root->root_key.objectid,
+ btrfs_header_level(buf), 0,
+ BTRFS_DROP_DELAYED_REF);
ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
buf->len, parent,
root->root_key.objectid,
@@ -7329,16 +7267,21 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
return 0;
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
+ btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
+ root_objectid, owner, offset,
+ BTRFS_DROP_DELAYED_REF);
/*
* tree log blocks never actually go into the extent allocation
@@ -8306,17 +8249,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- u64 root_objectid, u64 owner,
+ struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+ BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
+ root->root_key.objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT);
ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
- ins->offset, 0, root_objectid, owner,
+ ins->offset, 0,
+ root->root_key.objectid, owner,
offset, ram_bytes,
BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
return ret;
@@ -8538,6 +8486,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->is_data = false;
extent_op->level = level;
+ btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
+ root_objectid, level, 0,
+ BTRFS_ADD_DELAYED_EXTENT);
ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
ins.offset, parent,
root_objectid, level,
@@ -8894,7 +8845,7 @@ skip:
ret);
}
}
- ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
@@ -9311,7 +9262,7 @@ out:
* don't have it in the radix (like when we recover after a power fail
* or unmount) so we don't leak memory.
*/
- if (!for_reloc && root_dropped == false)
+ if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
btrfs_handle_fs_error(fs_info, err, NULL);
@@ -9968,9 +9919,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
return 0;
}
-static void __link_block_group(struct btrfs_space_info *space_info,
- struct btrfs_block_group_cache *cache)
+static void link_block_group(struct btrfs_block_group_cache *cache)
{
+ struct btrfs_space_info *space_info = cache->space_info;
int index = get_block_group_index(cache);
bool first = false;
@@ -10178,7 +10129,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
cache->space_info = space_info;
- __link_block_group(space_info, cache);
+ link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_readonly(info, cache->key.objectid)) {
@@ -10337,7 +10288,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->bytes_super, &cache->space_info);
update_global_block_rsv(fs_info);
- __link_block_group(cache->space_info, cache);
+ link_block_group(cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
@@ -10387,6 +10338,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* remove it.
*/
free_excluded_extents(fs_info, block_group);
+ btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
+ block_group->key.offset);
memcpy(&key, &block_group->key, sizeof(key));
index = get_block_group_index(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e5bb0cdd3cd..adbbc017191c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
@@ -109,7 +110,6 @@ struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
get_extent_t *get_extent;
- unsigned long bio_flags;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
@@ -2761,8 +2761,8 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
*/
static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct writeback_control *wbc,
- struct page *page, sector_t sector,
- size_t size, unsigned long offset,
+ struct page *page, u64 offset,
+ size_t size, unsigned long pg_offset,
struct block_device *bdev,
struct bio **bio_ret,
bio_end_io_t end_io_func,
@@ -2776,6 +2776,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
int contig = 0;
int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
+ sector_t sector = offset >> 9;
if (bio_ret && *bio_ret) {
bio = *bio_ret;
@@ -2786,8 +2787,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
if (prev_bio_flags != bio_flags || !contig ||
force_bio_submit ||
- merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
- bio_add_page(bio, page, page_size, offset) < page_size) {
+ merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) ||
+ bio_add_page(bio, page, page_size, pg_offset) < page_size) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
@@ -2801,8 +2802,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
}
}
- bio = btrfs_bio_alloc(bdev, sector << 9);
- bio_add_page(bio, page, page_size, offset);
+ bio = btrfs_bio_alloc(bdev, offset);
+ bio_add_page(bio, page, page_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
@@ -2892,7 +2893,6 @@ static int __do_readpage(struct extent_io_tree *tree,
u64 last_byte = i_size_read(inode);
u64 block_start;
u64 cur_end;
- sector_t sector;
struct extent_map *em;
struct block_device *bdev;
int ret = 0;
@@ -2928,6 +2928,7 @@ static int __do_readpage(struct extent_io_tree *tree,
}
while (cur <= end) {
bool force_bio_submit = false;
+ u64 offset;
if (cur >= last_byte) {
char *userpage;
@@ -2967,9 +2968,9 @@ static int __do_readpage(struct extent_io_tree *tree,
iosize = ALIGN(iosize, blocksize);
if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
disk_io_size = em->block_len;
- sector = em->block_start >> 9;
+ offset = em->block_start;
} else {
- sector = (em->block_start + extent_offset) >> 9;
+ offset = em->block_start + extent_offset;
disk_io_size = iosize;
}
bdev = em->bdev;
@@ -3062,8 +3063,8 @@ static int __do_readpage(struct extent_io_tree *tree,
}
ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
- page, sector, disk_io_size, pg_offset,
- bdev, bio,
+ page, offset, disk_io_size,
+ pg_offset, bdev, bio,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag,
@@ -3324,7 +3325,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
u64 extent_offset;
u64 block_start;
u64 iosize;
- sector_t sector;
struct extent_map *em;
struct block_device *bdev;
size_t pg_offset = 0;
@@ -3367,6 +3367,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
while (cur <= end) {
u64 em_end;
+ u64 offset;
if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
@@ -3388,7 +3389,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
BUG_ON(end < cur);
iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
- sector = (em->block_start + extent_offset) >> 9;
+ offset = em->block_start + extent_offset;
bdev = em->bdev;
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -3431,7 +3432,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
}
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- page, sector, iosize, pg_offset,
+ page, offset, iosize, pg_offset,
bdev, &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
@@ -3471,8 +3472,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unsigned int write_flags = 0;
unsigned long nr_written = 0;
- if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = REQ_SYNC;
+ write_flags = wbc_to_write_flags(wbc);
trace___extent_writepage(page, inode, wbc);
@@ -3716,16 +3716,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
u64 offset = eb->start;
u32 nritems;
unsigned long i, num_pages;
- unsigned long bio_flags = 0;
unsigned long start, end;
- unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META;
+ unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
int ret = 0;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
- if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
- bio_flags = EXTENT_BIO_TREE_LOG;
/* set btree blocks beyond nritems with 0 to avoid stale content. */
nritems = btrfs_header_nritems(eb);
@@ -3749,11 +3746,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- p, offset >> 9, PAGE_SIZE, 0, bdev,
+ p, offset, PAGE_SIZE, 0, bdev,
&epd->bio,
end_bio_extent_buffer_writepage,
- 0, epd->bio_flags, bio_flags, false);
- epd->bio_flags = bio_flags;
+ 0, 0, 0, false);
if (ret) {
set_btree_ioerr(p);
if (PageWriteback(p))
@@ -3790,7 +3786,6 @@ int btree_write_cache_pages(struct address_space *mapping,
.tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
- .bio_flags = 0,
};
int ret = 0;
int done = 0;
@@ -4063,10 +4058,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
if (epd->bio) {
int ret;
- bio_set_op_attrs(epd->bio, REQ_OP_WRITE,
- epd->sync_io ? REQ_SYNC : 0);
-
- ret = submit_one_bio(epd->bio, 0, epd->bio_flags);
+ ret = submit_one_bio(epd->bio, 0, 0);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
@@ -4089,7 +4081,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
- .bio_flags = 0,
};
ret = __extent_writepage(page, wbc, &epd);
@@ -4114,7 +4105,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
- .bio_flags = 0,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
@@ -4154,7 +4144,6 @@ int extent_writepages(struct extent_io_tree *tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
- .bio_flags = 0,
};
ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index faffa28ba707..4a8861379d3e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __EXTENTIO__
#define __EXTENTIO__
@@ -33,7 +34,6 @@
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
-#define EXTENT_BIO_TREE_LOG 2
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 69850155870c..2e348fb0b280 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index a67b2def5413..64365bbc9b16 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __EXTENTMAP__
#define __EXTENTMAP__
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..f80254d82f40 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -856,7 +856,7 @@ next_slot:
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
- ret = btrfs_inc_extent_ref(trans, fs_info,
+ ret = btrfs_inc_extent_ref(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
@@ -940,7 +940,7 @@ delete_extent_item:
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- ret = btrfs_free_extent(trans, fs_info,
+ ret = btrfs_free_extent(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
@@ -1234,7 +1234,7 @@ again:
extent_end - split);
btrfs_mark_buffer_dirty(leaf);
- ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes,
+ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset);
if (ret) {
@@ -1268,7 +1268,7 @@ again:
extent_end = other_end;
del_slot = path->slots[0] + 1;
del_nr++;
- ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset);
if (ret) {
@@ -1288,7 +1288,7 @@ again:
key.offset = other_start;
del_slot = path->slots[0];
del_nr++;
- ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset);
if (ret) {
@@ -1590,7 +1590,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
int ret = 0;
bool only_release_metadata = false;
bool force_page_uptodate = false;
- bool need_unlock;
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
@@ -1613,6 +1612,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
size_t copied;
size_t dirty_sectors;
size_t num_sectors;
+ int extents_locked;
WARN_ON(num_pages > nrptrs);
@@ -1656,6 +1656,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
}
+ WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes);
if (ret) {
@@ -1669,7 +1670,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
release_bytes = reserve_bytes;
- need_unlock = false;
again:
/*
* This is going to setup the pages array with the number of
@@ -1679,19 +1679,23 @@ again:
ret = prepare_pages(inode, pages, num_pages,
pos, write_bytes,
force_page_uptodate);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ reserve_bytes);
break;
+ }
- ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages,
+ extents_locked = lock_and_cleanup_extent_if_need(
+ BTRFS_I(inode), pages,
num_pages, pos, write_bytes, &lockstart,
&lockend, &cached_state);
- if (ret < 0) {
- if (ret == -EAGAIN)
+ if (extents_locked < 0) {
+ if (extents_locked == -EAGAIN)
goto again;
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ reserve_bytes);
+ ret = extents_locked;
break;
- } else if (ret > 0) {
- need_unlock = true;
- ret = 0;
}
copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
@@ -1718,23 +1722,10 @@ again:
PAGE_SIZE);
}
- /*
- * If we had a short copy we need to release the excess delaloc
- * bytes we reserved. We need to increment outstanding_extents
- * because btrfs_delalloc_release_space and
- * btrfs_delalloc_release_metadata will decrement it, but
- * we still have an outstanding extent for the chunk we actually
- * managed to copy.
- */
if (num_sectors > dirty_sectors) {
/* release everything except the sectors we dirtied */
release_bytes -= dirty_sectors <<
fs_info->sb->s_blocksize_bits;
- if (copied > 0) {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes);
@@ -1756,10 +1747,11 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, NULL);
- if (need_unlock)
+ if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state,
GFP_NOFS);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -2046,7 +2038,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
- bool full_sync = 0;
+ bool full_sync = false;
u64 len;
/*
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 684f12247db7..fe5e0324dca9 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1286,12 +1286,8 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
- u64 start, end;
int ret;
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
-
block_group->needs_free_space = 0;
ret = add_new_free_space_info(trans, fs_info, block_group, path);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d02019747d00..022b19336fee 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -500,11 +500,12 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
index c8e864b2d530..6734ec92a1e9 100644
--- a/fs/btrfs/inode-map.h
+++ b/fs/btrfs/inode-map.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BTRFS_INODE_MAP
#define __BTRFS_INODE_MAP
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 128f3e58634f..b93fe05a39c7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
+#include <linux/magic.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -67,7 +68,6 @@ struct btrfs_iget_args {
};
struct btrfs_dio_data {
- u64 outstanding_extents;
u64 reserve;
u64 unsubmitted_oe_range_start;
u64 unsubmitted_oe_range_end;
@@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
const u64 offset,
const u64 bytes)
{
+ unsigned long index = offset >> PAGE_SHIFT;
+ unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ index++;
+ if (!page)
+ continue;
+ ClearPagePrivate2(page);
+ put_page(page);
+ }
return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
bytes - PAGE_SIZE, false);
}
@@ -304,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_free_path(path);
return PTR_ERR(trans);
}
- trans->block_rsv = &fs_info->delalloc_block_rsv;
+ trans->block_rsv = &BTRFS_I(inode)->block_rsv;
if (compressed_size && compressed_pages)
extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -336,7 +348,6 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
}
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start);
btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
/*
@@ -446,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 num_bytes;
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
u64 isize = i_size_read(inode);
@@ -496,8 +506,6 @@ again:
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
- num_bytes = ALIGN(end - start + 1, blocksize);
- num_bytes = max(blocksize, num_bytes);
total_in = 0;
ret = 0;
@@ -530,7 +538,10 @@ again:
*/
extent_range_clear_dirty_for_io(inode, start, end);
redirty = 1;
- ret = btrfs_compress_pages(compress_type,
+
+ /* Compression level is applied here and only here */
+ ret = btrfs_compress_pages(
+ compress_type | (fs_info->compress_level << 4),
inode->i_mapping, start,
pages,
&nr_pages,
@@ -558,7 +569,7 @@ again:
cont:
if (start == 0) {
/* lets try to make an inline extent */
- if (ret || total_in < (actual_end - start)) {
+ if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
@@ -572,16 +583,21 @@ cont:
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING;
unsigned long page_error_op;
- clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
/*
* inline extent creation worked or returned error,
* we don't need to create any more async work items.
* Unlock and free up our temp pages.
+ *
+ * We use DO_ACCOUNTING here because we need the
+ * delalloc_release_metadata to be done _after_ we drop
+ * our outstanding extent for clearing delalloc for this
+ * range.
*/
extent_clear_unlock_delalloc(inode, start, end, end,
NULL, clear_flags,
@@ -590,10 +606,6 @@ cont:
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- if (ret == 0)
- btrfs_free_reserved_data_space_noquota(inode,
- start,
- end - start + 1);
goto free_pages_out;
}
}
@@ -613,7 +625,6 @@ cont:
*/
total_in = ALIGN(total_in, PAGE_SIZE);
if (total_compressed + blocksize <= total_in) {
- num_bytes = total_in;
*num_added += 1;
/*
@@ -621,12 +632,12 @@ cont:
* allocation on disk for these compressed pages, and
* will submit them to the elevator.
*/
- add_async_extent(async_cow, start, num_bytes,
+ add_async_extent(async_cow, start, total_in,
total_compressed, pages, nr_pages,
compress_type);
- if (start + num_bytes < end) {
- start += num_bytes;
+ if (start + total_in < end) {
+ start += total_in;
pages = NULL;
cond_resched();
goto again;
@@ -970,15 +981,19 @@ static noinline int cow_file_range(struct inode *inode,
ret = cow_file_range_inline(root, inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
if (ret == 0) {
+ /*
+ * We use DO_ACCOUNTING here because we need the
+ * delalloc_release_metadata to be run _after_ we drop
+ * our outstanding extent for clearing delalloc for this
+ * range.
+ */
extent_clear_unlock_delalloc(inode, start, end,
delalloc_end, NULL,
EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG, PAGE_UNLOCK |
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
- btrfs_free_reserved_data_space_noquota(inode, start,
- end - start + 1);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
@@ -1214,13 +1229,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
- while (atomic_read(&fs_info->async_submit_draining) &&
- atomic_read(&fs_info->async_delalloc_pages)) {
- wait_event(fs_info->async_submit_wait,
- (atomic_read(&fs_info->async_delalloc_pages) ==
- 0));
- }
-
*nr_written += nr_pages;
start = cur_end + 1;
}
@@ -1623,7 +1631,7 @@ static void btrfs_split_extent_hook(void *private_data,
}
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -1653,7 +1661,7 @@ static void btrfs_merge_extent_hook(void *private_data,
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents--;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
return;
}
@@ -1684,7 +1692,7 @@ static void btrfs_merge_extent_hook(void *private_data,
return;
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents--;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -1754,15 +1762,12 @@ static void btrfs_set_bit_hook(void *private_data,
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
+ u32 num_extents = count_max_extents(len);
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
- if (*bits & EXTENT_FIRST_DELALLOC) {
- *bits &= ~EXTENT_FIRST_DELALLOC;
- } else {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
+ spin_unlock(&BTRFS_I(inode)->lock);
/* For sanity tests */
if (btrfs_is_testing(fs_info))
@@ -1816,13 +1821,9 @@ static void btrfs_clear_bit_hook(void *private_data,
struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
- if (*bits & EXTENT_FIRST_DELALLOC) {
- *bits &= ~EXTENT_FIRST_DELALLOC;
- } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
- spin_lock(&inode->lock);
- inode->outstanding_extents -= num_extents;
- spin_unlock(&inode->lock);
- }
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, -num_extents);
+ spin_unlock(&inode->lock);
/*
* We don't reserve metadata space for space cache inodes so we
@@ -2093,6 +2094,7 @@ again:
0);
ClearPageChecked(page);
set_page_dirty(page);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
@@ -2217,8 +2219,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
qg_released = ret;
- ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins);
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ file_pos, qg_released, &ins);
out:
btrfs_free_path(path);
@@ -2452,7 +2455,7 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
ret = iterate_inodes_from_logical(old->bytenr +
old->extent_offset, fs_info,
path, record_one_backref,
- old);
+ old, false);
if (ret < 0 && ret != -ENOENT)
return false;
@@ -2670,7 +2673,7 @@ again:
inode_add_bytes(inode, len);
btrfs_release_path(path);
- ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
+ ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
new->disk_len, 0,
backref->root_id, backref->inum,
new->file_pos); /* start - extent_offset */
@@ -2952,7 +2955,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out;
}
- trans->block_rsv = &fs_info->delalloc_block_rsv;
+ trans->block_rsv = &BTRFS_I(inode)->block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
@@ -2988,7 +2991,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- trans->block_rsv = &fs_info->delalloc_block_rsv;
+ trans->block_rsv = &BTRFS_I(inode)->block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
@@ -3046,9 +3049,6 @@ out:
0, &cached_state, GFP_NOFS);
}
- if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- ordered_extent->len);
if (trans)
btrfs_end_transaction(trans);
@@ -4360,47 +4360,11 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
}
-static int truncate_inline_extent(struct inode *inode,
- struct btrfs_path *path,
- struct btrfs_key *found_key,
- const u64 item_end,
- const u64 new_size)
-{
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
- struct btrfs_file_extent_item *fi;
- u32 size = (u32)(new_size - found_key->offset);
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
- loff_t offset = new_size;
- loff_t page_end = ALIGN(offset, PAGE_SIZE);
-
- /*
- * Zero out the remaining of the last page of our inline extent,
- * instead of directly truncating our inline extent here - that
- * would be much more complex (decompressing all the data, then
- * compressing the truncated data, which might be bigger than
- * the size of the inline extent, resize the extent, etc).
- * We release the path because to get the page we might need to
- * read the extent item from disk (data not in the page cache).
- */
- btrfs_release_path(path);
- return btrfs_truncate_block(inode, offset, page_end - offset,
- 0);
- }
-
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(root->fs_info, path, size, 1);
-
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- inode_sub_bytes(inode, item_end + 1 - new_size);
-
- return 0;
-}
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define NEED_TRUNCATE_BLOCK 1
/*
* this can truncate away extent items, csum items and directory items.
@@ -4439,9 +4403,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int err = 0;
u64 ino = btrfs_ino(BTRFS_I(inode));
u64 bytes_deleted = 0;
- bool be_nice = 0;
- bool should_throttle = 0;
- bool should_end = 0;
+ bool be_nice = false;
+ bool should_throttle = false;
+ bool should_end = false;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
@@ -4451,7 +4415,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
*/
if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- be_nice = 1;
+ be_nice = true;
path = btrfs_alloc_path();
if (!path)
@@ -4561,11 +4525,6 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
- if (del_item)
- last_size = found_key.offset;
- else
- last_size = new_size;
-
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -4607,40 +4566,30 @@ search_again:
*/
if (!del_item &&
btrfs_file_extent_encryption(leaf, fi) == 0 &&
- btrfs_file_extent_other_encoding(leaf, fi) == 0) {
-
+ btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+ btrfs_file_extent_compression(leaf, fi) == 0) {
+ u32 size = (u32)(new_size - found_key.offset);
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(root->fs_info, path, size, 1);
+ } else if (!del_item) {
/*
- * Need to release path in order to truncate a
- * compressed extent. So delete any accumulated
- * extent items so far.
+ * We have to bail so the last_size is set to
+ * just before this extent.
*/
- if (btrfs_file_extent_compression(leaf, fi) !=
- BTRFS_COMPRESS_NONE && pending_del_nr) {
- err = btrfs_del_items(trans, root, path,
- pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans,
- err);
- goto error;
- }
- pending_del_nr = 0;
- }
+ err = NEED_TRUNCATE_BLOCK;
+ break;
+ }
- err = truncate_inline_extent(inode, path,
- &found_key,
- item_end,
- new_size);
- if (err) {
- btrfs_abort_transaction(trans, err);
- goto error;
- }
- } else if (test_bit(BTRFS_ROOT_REF_COWS,
- &root->state)) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 - new_size);
- }
}
delete:
+ if (del_item)
+ last_size = found_key.offset;
+ else
+ last_size = new_size;
if (del_item) {
if (!pending_del_nr) {
/* no pending yet, add ourselves */
@@ -4657,14 +4606,14 @@ delete:
} else {
break;
}
- should_throttle = 0;
+ should_throttle = false;
if (found_extent &&
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root == fs_info->tree_root)) {
btrfs_set_path_blocking(path);
bytes_deleted += extent_num_bytes;
- ret = btrfs_free_extent(trans, fs_info, extent_start,
+ ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
ino, extent_offset);
@@ -4676,11 +4625,11 @@ delete:
if (be_nice) {
if (truncate_space_check(trans, root,
extent_num_bytes)) {
- should_end = 1;
+ should_end = true;
}
if (btrfs_should_throttle_delayed_refs(trans,
fs_info))
- should_throttle = 1;
+ should_throttle = true;
}
}
@@ -4789,8 +4738,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
+ block_start = round_down(from, blocksize);
+ block_end = block_start + blocksize - 1;
+
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- round_down(from, blocksize), blocksize);
+ block_start, blocksize);
if (ret)
goto out;
@@ -4798,15 +4750,12 @@ again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
- round_down(from, blocksize),
- blocksize);
+ block_start, blocksize);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
goto out;
}
- block_start = round_down(from, blocksize);
- block_end = block_start + blocksize - 1;
-
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
lock_page(page);
@@ -4871,6 +4820,7 @@ out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
@@ -7785,33 +7735,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
return em;
}
-static void adjust_dio_outstanding_extents(struct inode *inode,
- struct btrfs_dio_data *dio_data,
- const u64 len)
-{
- unsigned num_extents = count_max_extents(len);
-
- /*
- * If we have an outstanding_extents count still set then we're
- * within our reservation, otherwise we need to adjust our inode
- * counter appropriately.
- */
- if (dio_data->outstanding_extents >= num_extents) {
- dio_data->outstanding_extents -= num_extents;
- } else {
- /*
- * If dio write length has been split due to no large enough
- * contiguous space, we need to compensate our inode counter
- * appropriately.
- */
- u64 num_needed = num_extents - dio_data->outstanding_extents;
-
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents += num_needed;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
-}
-
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -7973,7 +7896,6 @@ unlock:
if (!dio_data->overwrite && start + len > i_size_read(inode))
i_size_write(inode, start + len);
- adjust_dio_outstanding_extents(inode, dio_data, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
dio_data->unsubmitted_oe_range_end = start + len;
@@ -8003,14 +7925,6 @@ unlock_err:
err:
if (dio_data)
current->journal_info = dio_data;
- /*
- * Compensate the delalloc release we do in btrfs_direct_IO() when we
- * write less data then expected, so that we don't underflow our inode's
- * outstanding extents counter.
- */
- if (create && dio_data)
- adjust_dio_outstanding_extents(inode, dio_data, len);
-
return ret;
}
@@ -8357,11 +8271,8 @@ static void btrfs_endio_direct_read(struct bio *bio)
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
blk_status_t err = bio->bi_status;
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) {
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
err = btrfs_subio_endio_read(inode, io_bio, err);
- if (!err)
- bio->bi_status = 0;
- }
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
@@ -8369,7 +8280,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
- dio_bio->bi_status = bio->bi_status;
+ dio_bio->bi_status = err;
dio_end_io(dio_bio);
if (io_bio->end_io)
@@ -8387,6 +8298,7 @@ static void __endio_write_update_ordered(struct inode *inode,
btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
+ u64 last_offset;
int ret;
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
@@ -8398,6 +8310,7 @@ static void __endio_write_update_ordered(struct inode *inode,
}
again:
+ last_offset = ordered_offset;
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
ordered_bytes,
@@ -8409,6 +8322,12 @@ again:
btrfs_queue_work(wq, &ordered->work);
out_test:
/*
+ * If btrfs_dec_test_ordered_pending does not find any ordered extent
+ * in the range, we can exit.
+ */
+ if (ordered_offset == last_offset)
+ return;
+ /*
* our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again
*/
@@ -8478,7 +8397,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (dip->errors) {
bio_io_error(dip->orig_bio);
} else {
- dip->dio_bio->bi_status = 0;
+ dip->dio_bio->bi_status = BLK_STS_OK;
bio_endio(dip->orig_bio);
}
out:
@@ -8560,7 +8479,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
goto err;
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0, async_submit);
+ ret = btrfs_map_bio(fs_info, bio, 0, 0);
err:
bio_put(bio);
return ret;
@@ -8769,7 +8688,6 @@ free_ordered:
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- struct kiocb *iocb,
const struct iov_iter *iter, loff_t offset)
{
int seg;
@@ -8816,7 +8734,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
bool relock = false;
ssize_t ret;
- if (check_direct_IO(fs_info, iocb, iter, offset))
+ if (check_direct_IO(fs_info, iter, offset))
return 0;
inode_dio_begin(inode);
@@ -8851,7 +8769,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
offset, count);
if (ret)
goto out;
- dio_data.outstanding_extents = count_max_extents(count);
/*
* We need to know how many extents we reserved so that we can
@@ -8898,6 +8815,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
offset, count - (size_t)ret);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
out:
if (wakeup)
@@ -9215,9 +9133,6 @@ again:
fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE - reserved_space);
}
@@ -9269,12 +9184,14 @@ again:
out_unlock:
if (!ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
}
unlock_page(page);
out:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved, page_start,
reserved_space);
out_noreserve:
@@ -9370,12 +9287,12 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
+ trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN) {
err = ret;
break;
}
- trans->block_rsv = &fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
err = ret;
@@ -9399,6 +9316,27 @@ static int btrfs_truncate(struct inode *inode)
trans->block_rsv = rsv;
}
+ /*
+ * We can't call btrfs_truncate_block inside a trans handle as we could
+ * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
+ * we've truncated everything except the last little bit, and can do
+ * btrfs_truncate_block and then update the disk_i_size.
+ */
+ if (ret == NEED_TRUNCATE_BLOCK) {
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+
+ ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ }
+
if (ret == 0 && inode->i_nlink > 0) {
trans->block_rsv = root->orphan_block_rsv;
ret = btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -9463,6 +9401,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
@@ -9489,8 +9428,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
- ei->reserved_extents = 0;
-
+ if (sb->s_magic != BTRFS_TEST_MAGIC)
+ btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
ei->runtime_flags = 0;
ei->prop_compress = BTRFS_COMPRESS_NONE;
ei->defrag_compress = BTRFS_COMPRESS_NONE;
@@ -9540,8 +9480,9 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(!hlist_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
+ WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
+ WARN_ON(BTRFS_I(inode)->block_rsv.size);
WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
@@ -10320,19 +10261,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
ret = __start_delalloc_inodes(root, delay_iput, -1);
if (ret > 0)
ret = 0;
- /*
- * the filemap_flush will queue IO into the worker threads, but
- * we have to make sure the IO is actually started and that
- * ordered extents get created before we return
- */
- atomic_inc(&fs_info->async_submit_draining);
- while (atomic_read(&fs_info->nr_async_submits) ||
- atomic_read(&fs_info->async_delalloc_pages)) {
- wait_event(fs_info->async_submit_wait,
- (atomic_read(&fs_info->nr_async_submits) == 0 &&
- atomic_read(&fs_info->async_delalloc_pages) == 0));
- }
- atomic_dec(&fs_info->async_submit_draining);
return ret;
}
@@ -10374,14 +10302,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
spin_unlock(&fs_info->delalloc_root_lock);
ret = 0;
- atomic_inc(&fs_info->async_submit_draining);
- while (atomic_read(&fs_info->nr_async_submits) ||
- atomic_read(&fs_info->async_delalloc_pages)) {
- wait_event(fs_info->async_submit_wait,
- (atomic_read(&fs_info->nr_async_submits) == 0 &&
- atomic_read(&fs_info->async_delalloc_pages) == 0));
- }
- atomic_dec(&fs_info->async_submit_draining);
out:
if (!list_empty_careful(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d6715c2bcdc4..fd172a93d11a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -86,6 +86,19 @@ struct btrfs_ioctl_received_subvol_args_32 {
struct btrfs_ioctl_received_subvol_args_32)
#endif
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+struct btrfs_ioctl_send_args_32 {
+ __s64 send_fd; /* in */
+ __u64 clone_sources_count; /* in */
+ compat_uptr_t clone_sources; /* in */
+ __u64 parent_root; /* in */
+ __u64 flags; /* in */
+ __u64 reserved[4]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
+ struct btrfs_ioctl_send_args_32)
+#endif
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff,
@@ -609,23 +622,6 @@ fail_free:
return ret;
}
-static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root)
-{
- s64 writers;
- DEFINE_WAIT(wait);
-
- do {
- prepare_to_wait(&root->subv_writers->wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
- writers = percpu_counter_sum(&root->subv_writers->counter);
- if (writers)
- schedule();
-
- finish_wait(&root->subv_writers->wait, &wait);
- } while (writers);
-}
-
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry,
u64 *async_transid, bool readonly,
@@ -654,7 +650,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
atomic_inc(&root->will_be_snapshotted);
smp_mb__after_atomic();
- btrfs_wait_for_no_snapshotting_writes(root);
+ /* wait for no snapshot writes */
+ wait_event(root->subv_writers->wait,
+ percpu_counter_sum(&root->subv_writers->counter) == 0);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
@@ -1219,6 +1217,7 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1229,6 +1228,7 @@ out:
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
@@ -1420,21 +1420,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
filemap_flush(inode->i_mapping);
}
- if (do_compress) {
- /* the filemap_flush will queue IO into the worker threads, but
- * we have to make sure the IO is actually started and that
- * ordered extents get created before we return
- */
- atomic_inc(&fs_info->async_submit_draining);
- while (atomic_read(&fs_info->nr_async_submits) ||
- atomic_read(&fs_info->async_delalloc_pages)) {
- wait_event(fs_info->async_submit_wait,
- (atomic_read(&fs_info->nr_async_submits) == 0 &&
- atomic_read(&fs_info->async_delalloc_pages) == 0));
- }
- atomic_dec(&fs_info->async_submit_draining);
- }
-
if (range->compress_type == BTRFS_COMPRESS_LZO) {
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
@@ -1842,8 +1827,13 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
+ if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out_reset;
+ }
+
+ ret = btrfs_commit_transaction(trans);
- btrfs_commit_transaction(trans);
out_reset:
if (ret)
btrfs_set_root_flags(&root->root_item, root_flags);
@@ -2179,7 +2169,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
inode = file_inode(file);
ret = search_ioctl(inode, &args.key, &buf_size,
- (char *)(&uarg->buf[0]));
+ (char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
ret = -EFAULT;
else if (ret == -EOVERFLOW &&
@@ -2773,9 +2763,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
mutex_unlock(&fs_devices->device_list_mutex);
- fi_args->nodesize = fs_info->super_copy->nodesize;
- fi_args->sectorsize = fs_info->super_copy->sectorsize;
- fi_args->clone_alignment = fs_info->super_copy->sectorsize;
+ fi_args->nodesize = fs_info->nodesize;
+ fi_args->sectorsize = fs_info->sectorsize;
+ fi_args->clone_alignment = fs_info->sectorsize;
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -3032,7 +3022,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
out:
if (ret)
btrfs_cmp_data_free(cmp);
- return 0;
+ return ret;
}
static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
@@ -3706,7 +3696,7 @@ process_slot:
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans,
- fs_info,
+ root,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(BTRFS_I(inode)),
@@ -4061,6 +4051,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
+ if (!is_fstree(new_root->objectid)) {
+ ret = -ENOENT;
+ goto out;
+ }
path = btrfs_alloc_path();
if (!path) {
@@ -4125,10 +4119,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_space_info *dest_orig;
struct btrfs_ioctl_space_info __user *user_dest;
struct btrfs_space_info *info;
- u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
- BTRFS_BLOCK_GROUP_SYSTEM,
- BTRFS_BLOCK_GROUP_METADATA,
- BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ static const u64 types[] = {
+ BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
+ };
int num_types = 4;
int alloc_size;
int ret = 0;
@@ -4500,8 +4496,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
ipath->fspath->val[i] = rel_ptr;
}
- ret = copy_to_user((void *)(unsigned long)ipa->fspath,
- (void *)(unsigned long)ipath->fspath, size);
+ ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
+ ipath->fspath, size);
if (ret) {
ret = -EFAULT;
goto out;
@@ -4536,13 +4532,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
}
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
- void __user *arg)
+ void __user *arg, int version)
{
int ret = 0;
int size;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
+ bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4551,13 +4548,30 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
if (IS_ERR(loi))
return PTR_ERR(loi);
+ if (version == 1) {
+ ignore_offset = false;
+ size = min_t(u32, loi->size, SZ_64K);
+ } else {
+ /* All reserved bits must be 0 for now */
+ if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
+ ret = -EINVAL;
+ goto out_loi;
+ }
+ /* Only accept flags we have defined so far */
+ if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
+ ret = -EINVAL;
+ goto out_loi;
+ }
+ ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
+ size = min_t(u32, loi->size, SZ_16M);
+ }
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
- size = min_t(u32, loi->size, SZ_64K);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -4566,20 +4580,21 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
}
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- build_ino_list, inodes);
+ build_ino_list, inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
goto out;
- ret = copy_to_user((void *)(unsigned long)loi->inodes,
- (void *)(unsigned long)inodes, size);
+ ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
+ size);
if (ret)
ret = -EFAULT;
out:
btrfs_free_path(path);
kvfree(inodes);
+out_loi:
kfree(loi);
return ret;
@@ -5156,15 +5171,11 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
goto out;
}
}
ret = btrfs_commit_transaction(trans);
- if (ret < 0) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
out:
up_write(&fs_info->subvol_sem);
mnt_drop_write_file(file);
@@ -5486,6 +5497,41 @@ out_drop_write:
return ret;
}
+static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_send_args *arg;
+ int ret;
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_send_args_32 args32;
+
+ ret = copy_from_user(&args32, argp, sizeof(args32));
+ if (ret)
+ return -EFAULT;
+ arg = kzalloc(sizeof(*arg), GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+ arg->send_fd = args32.send_fd;
+ arg->clone_sources_count = args32.clone_sources_count;
+ arg->clone_sources = compat_ptr(args32.clone_sources);
+ arg->parent_root = args32.parent_root;
+ arg->flags = args32.flags;
+ memcpy(arg->reserved, args32.reserved,
+ sizeof(args32.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ arg = memdup_user(argp, sizeof(*arg));
+ if (IS_ERR(arg))
+ return PTR_ERR(arg);
+ }
+ ret = btrfs_ioctl_send(file, arg);
+ kfree(arg);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -5550,7 +5596,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
- return btrfs_ioctl_logical_to_ino(fs_info, argp);
+ return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
+ case BTRFS_IOC_LOGICAL_INO_V2:
+ return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(fs_info, argp);
case BTRFS_IOC_SYNC: {
@@ -5591,7 +5639,11 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return btrfs_ioctl_send(file, argp);
+ return _btrfs_ioctl_send(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_SEND_32:
+ return _btrfs_ioctl_send(file, argp, true);
+#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
case BTRFS_IOC_QUOTA_CTL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d433e75d489a..6c7f18cd3b61 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -430,10 +430,15 @@ out:
return ret;
}
+static void lzo_set_level(struct list_head *ws, unsigned int type)
+{
+}
+
const struct btrfs_compress_op btrfs_lzo_compress = {
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
.decompress_bio = lzo_decompress_bio,
.decompress = lzo_decompress,
+ .set_level = lzo_set_level,
};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a3aca495e33e..5b311aeddcc8 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -242,6 +242,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
}
spin_unlock(&root->ordered_extent_lock);
+ /*
+ * We don't need the count_max_extents here, we can assume that all of
+ * that work has been done at higher layers, so this is truly the
+ * smallest the extent is going to get.
+ */
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
+ spin_unlock(&BTRFS_I(inode)->lock);
+
return 0;
}
@@ -591,11 +600,19 @@ void btrfs_remove_ordered_extent(struct inode *inode,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ struct btrfs_root *root = btrfs_inode->root;
struct rb_node *node;
bool dec_pending_ordered = false;
- tree = &BTRFS_I(inode)->ordered_tree;
+ /* This is paired with btrfs_add_ordered_extent. */
+ spin_lock(&btrfs_inode->lock);
+ btrfs_mod_outstanding_extents(btrfs_inode, -1);
+ spin_unlock(&btrfs_inode->lock);
+ if (root != fs_info->tree_root)
+ btrfs_delalloc_release_metadata(btrfs_inode, entry->len);
+
+ tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5c8b61c86e61..168fd03ca3ac 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -807,7 +807,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
}
ret = 0;
out:
- set_bit(BTRFS_FS_QUOTA_DISABLING, &root->fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -953,7 +952,6 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
if (!fs_info->quota_root)
goto out;
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- set_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
@@ -1307,6 +1305,8 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
}
}
ret = del_qgroup_item(trans, quota_root, qgroupid);
+ if (ret && ret != -ENOENT)
+ goto out;
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
@@ -1441,7 +1441,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
u64 bytenr = qrecord->bytenr;
int ret;
- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root);
+ ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
if (ret < 0)
return ret;
@@ -2031,7 +2031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
- &record->old_roots);
+ &record->old_roots, false);
if (ret < 0)
goto cleanup;
}
@@ -2042,7 +2042,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
* root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, SEQ_LAST, &new_roots);
+ record->bytenr, SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -2086,8 +2086,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags))
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_lock(&fs_info->qgroup_lock);
while (!list_empty(&fs_info->dirty_qgroups)) {
@@ -2572,7 +2570,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
num_bytes = found.offset;
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
- &roots);
+ &roots, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 24a62224b24b..a7f79254ecca 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1326,6 +1326,9 @@ write_data:
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
}
/*
@@ -1582,6 +1585,10 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
return -EIO;
finish:
@@ -2107,6 +2114,10 @@ cleanup:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
return -EIO;
}
@@ -2231,12 +2242,18 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
ASSERT(!bio->bi_iter.bi_size);
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
- for (i = 0; i < rbio->real_stripes; i++) {
+ /*
+ * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * to the end position, so this search can start from the first parity
+ * stripe.
+ */
+ for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
if (bbio->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
}
+ ASSERT(i < rbio->real_stripes);
/* Now we just support the sectorsize equals to page size */
ASSERT(fs_info->sectorsize == PAGE_SIZE);
@@ -2454,6 +2471,9 @@ submit_write:
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2563,12 +2583,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int stripe;
struct bio *bio;
+ bio_list_init(&bio_list);
+
ret = alloc_rbio_essential_pages(rbio);
if (ret)
goto cleanup;
- bio_list_init(&bio_list);
-
atomic_set(&rbio->error, 0);
/*
* build a list of bios to read all the missing parts of this
@@ -2636,6 +2656,10 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
return;
finish:
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
new file mode 100644
index 000000000000..34878699d363
--- /dev/null
+++ b/fs/btrfs/ref-verify.c
@@ -0,0 +1,1031 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "delayed-ref.h"
+#include "ref-verify.h"
+
+/*
+ * Used to keep track the roots and number of refs each root has for a given
+ * bytenr. This just tracks the number of direct references, no shared
+ * references.
+ */
+struct root_entry {
+ u64 root_objectid;
+ u64 num_refs;
+ struct rb_node node;
+};
+
+/*
+ * These are meant to represent what should exist in the extent tree, these can
+ * be used to verify the extent tree is consistent as these should all match
+ * what the extent tree says.
+ */
+struct ref_entry {
+ u64 root_objectid;
+ u64 parent;
+ u64 owner;
+ u64 offset;
+ u64 num_refs;
+ struct rb_node node;
+};
+
+#define MAX_TRACE 16
+
+/*
+ * Whenever we add/remove a reference we record the action. The action maps
+ * back to the delayed ref action. We hold the ref we are changing in the
+ * action so we can account for the history properly, and we record the root we
+ * were called with since it could be different from ref_root. We also store
+ * stack traces because thats how I roll.
+ */
+struct ref_action {
+ int action;
+ u64 root;
+ struct ref_entry ref;
+ struct list_head list;
+ unsigned long trace[MAX_TRACE];
+ unsigned int trace_len;
+};
+
+/*
+ * One of these for every block we reference, it holds the roots and references
+ * to it as well as all of the ref actions that have occured to it. We never
+ * free it until we unmount the file system in order to make sure re-allocations
+ * are happening properly.
+ */
+struct block_entry {
+ u64 bytenr;
+ u64 len;
+ u64 num_refs;
+ int metadata;
+ int from_disk;
+ struct rb_root roots;
+ struct rb_root refs;
+ struct rb_node node;
+ struct list_head actions;
+};
+
+static struct block_entry *insert_block_entry(struct rb_root *root,
+ struct block_entry *be)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct block_entry *entry;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct block_entry, node);
+ if (entry->bytenr > be->bytenr)
+ p = &(*p)->rb_left;
+ else if (entry->bytenr < be->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(&be->node, parent_node, p);
+ rb_insert_color(&be->node, root);
+ return NULL;
+}
+
+static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n;
+ struct block_entry *entry = NULL;
+
+ n = root->rb_node;
+ while (n) {
+ entry = rb_entry(n, struct block_entry, node);
+ if (entry->bytenr < bytenr)
+ n = n->rb_right;
+ else if (entry->bytenr > bytenr)
+ n = n->rb_left;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static struct root_entry *insert_root_entry(struct rb_root *root,
+ struct root_entry *re)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct root_entry *entry;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct root_entry, node);
+ if (entry->root_objectid > re->root_objectid)
+ p = &(*p)->rb_left;
+ else if (entry->root_objectid < re->root_objectid)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(&re->node, parent_node, p);
+ rb_insert_color(&re->node, root);
+ return NULL;
+
+}
+
+static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
+{
+ if (ref1->root_objectid < ref2->root_objectid)
+ return -1;
+ if (ref1->root_objectid > ref2->root_objectid)
+ return 1;
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ if (ref1->owner < ref2->owner)
+ return -1;
+ if (ref1->owner > ref2->owner)
+ return 1;
+ if (ref1->offset < ref2->offset)
+ return -1;
+ if (ref1->offset > ref2->offset)
+ return 1;
+ return 0;
+}
+
+static struct ref_entry *insert_ref_entry(struct rb_root *root,
+ struct ref_entry *ref)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct ref_entry *entry;
+ int cmp;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct ref_entry, node);
+ cmp = comp_refs(entry, ref);
+ if (cmp > 0)
+ p = &(*p)->rb_left;
+ else if (cmp < 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(&ref->node, parent_node, p);
+ rb_insert_color(&ref->node, root);
+ return NULL;
+
+}
+
+static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
+{
+ struct rb_node *n;
+ struct root_entry *entry = NULL;
+
+ n = root->rb_node;
+ while (n) {
+ entry = rb_entry(n, struct root_entry, node);
+ if (entry->root_objectid < objectid)
+ n = n->rb_right;
+ else if (entry->root_objectid > objectid)
+ n = n->rb_left;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_STACKTRACE
+static void __save_stack_trace(struct ref_action *ra)
+{
+ struct stack_trace stack_trace;
+
+ stack_trace.max_entries = MAX_TRACE;
+ stack_trace.nr_entries = 0;
+ stack_trace.entries = ra->trace;
+ stack_trace.skip = 2;
+ save_stack_trace(&stack_trace);
+ ra->trace_len = stack_trace.nr_entries;
+}
+
+static void __print_stack_trace(struct btrfs_fs_info *fs_info,
+ struct ref_action *ra)
+{
+ struct stack_trace trace;
+
+ if (ra->trace_len == 0) {
+ btrfs_err(fs_info, " ref-verify: no stacktrace");
+ return;
+ }
+ trace.nr_entries = ra->trace_len;
+ trace.entries = ra->trace;
+ print_stack_trace(&trace, 2);
+}
+#else
+static void inline __save_stack_trace(struct ref_action *ra)
+{
+}
+
+static void inline __print_stack_trace(struct btrfs_fs_info *fs_info,
+ struct ref_action *ra)
+{
+ btrfs_err(fs_info, " ref-verify: no stacktrace support");
+}
+#endif
+
+static void free_block_entry(struct block_entry *be)
+{
+ struct root_entry *re;
+ struct ref_entry *ref;
+ struct ref_action *ra;
+ struct rb_node *n;
+
+ while ((n = rb_first(&be->roots))) {
+ re = rb_entry(n, struct root_entry, node);
+ rb_erase(&re->node, &be->roots);
+ kfree(re);
+ }
+
+ while((n = rb_first(&be->refs))) {
+ ref = rb_entry(n, struct ref_entry, node);
+ rb_erase(&ref->node, &be->refs);
+ kfree(ref);
+ }
+
+ while (!list_empty(&be->actions)) {
+ ra = list_first_entry(&be->actions, struct ref_action,
+ list);
+ list_del(&ra->list);
+ kfree(ra);
+ }
+ kfree(be);
+}
+
+static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 len,
+ u64 root_objectid)
+{
+ struct block_entry *be = NULL, *exist;
+ struct root_entry *re = NULL;
+
+ re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
+ be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+ if (!be || !re) {
+ kfree(re);
+ kfree(be);
+ return ERR_PTR(-ENOMEM);
+ }
+ be->bytenr = bytenr;
+ be->len = len;
+
+ re->root_objectid = root_objectid;
+ re->num_refs = 0;
+
+ spin_lock(&fs_info->ref_verify_lock);
+ exist = insert_block_entry(&fs_info->block_tree, be);
+ if (exist) {
+ if (root_objectid) {
+ struct root_entry *exist_re;
+
+ exist_re = insert_root_entry(&exist->roots, re);
+ if (exist_re)
+ kfree(re);
+ }
+ kfree(be);
+ return exist;
+ }
+
+ be->num_refs = 0;
+ be->metadata = 0;
+ be->from_disk = 0;
+ be->roots = RB_ROOT;
+ be->refs = RB_ROOT;
+ INIT_LIST_HEAD(&be->actions);
+ if (root_objectid)
+ insert_root_entry(&be->roots, re);
+ else
+ kfree(re);
+ return be;
+}
+
+static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 parent, u64 bytenr, int level)
+{
+ struct block_entry *be;
+ struct root_entry *re;
+ struct ref_entry *ref = NULL, *exist;
+
+ ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ if (!ref)
+ return -ENOMEM;
+
+ if (parent)
+ ref->root_objectid = 0;
+ else
+ ref->root_objectid = ref_root;
+ ref->parent = parent;
+ ref->owner = level;
+ ref->offset = 0;
+ ref->num_refs = 1;
+
+ be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ return PTR_ERR(be);
+ }
+ be->num_refs++;
+ be->from_disk = 1;
+ be->metadata = 1;
+
+ if (!parent) {
+ ASSERT(ref_root);
+ re = lookup_root_entry(&be->roots, ref_root);
+ ASSERT(re);
+ re->num_refs++;
+ }
+ exist = insert_ref_entry(&be->refs, ref);
+ if (exist) {
+ exist->num_refs++;
+ kfree(ref);
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+
+ return 0;
+}
+
+static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
+ u64 parent, u32 num_refs, u64 bytenr,
+ u64 num_bytes)
+{
+ struct block_entry *be;
+ struct ref_entry *ref;
+
+ ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ if (!ref)
+ return -ENOMEM;
+ be = add_block_entry(fs_info, bytenr, num_bytes, 0);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ return PTR_ERR(be);
+ }
+ be->num_refs += num_refs;
+
+ ref->parent = parent;
+ ref->num_refs = num_refs;
+ if (insert_ref_entry(&be->refs, ref)) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ btrfs_err(fs_info, "existing shared ref when reading from disk?");
+ kfree(ref);
+ return -EINVAL;
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+ return 0;
+}
+
+static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *dref,
+ u64 bytenr, u64 num_bytes)
+{
+ struct block_entry *be;
+ struct ref_entry *ref;
+ struct root_entry *re;
+ u64 ref_root = btrfs_extent_data_ref_root(leaf, dref);
+ u64 owner = btrfs_extent_data_ref_objectid(leaf, dref);
+ u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
+ u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
+
+ ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ if (!ref)
+ return -ENOMEM;
+ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ return PTR_ERR(be);
+ }
+ be->num_refs += num_refs;
+
+ ref->parent = 0;
+ ref->owner = owner;
+ ref->root_objectid = ref_root;
+ ref->offset = offset;
+ ref->num_refs = num_refs;
+ if (insert_ref_entry(&be->refs, ref)) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ btrfs_err(fs_info, "existing ref when reading from disk?");
+ kfree(ref);
+ return -EINVAL;
+ }
+
+ re = lookup_root_entry(&be->roots, ref_root);
+ if (!re) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ btrfs_err(fs_info, "missing root in new block entry?");
+ return -EINVAL;
+ }
+ re->num_refs += num_refs;
+ spin_unlock(&fs_info->ref_verify_lock);
+ return 0;
+}
+
+static int process_extent_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, struct btrfs_key *key,
+ int slot, int *tree_block_level)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct extent_buffer *leaf = path->nodes[0];
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ unsigned long end, ptr;
+ u64 offset, flags, count;
+ int type, ret;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ if ((key->type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *tree_block_level = btrfs_tree_block_level(leaf, info);
+ iref = (struct btrfs_extent_inline_ref *)(info + 1);
+ } else {
+ if (key->type == BTRFS_METADATA_ITEM_KEY)
+ *tree_block_level = key->offset;
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ switch (type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, offset, 0, key->objectid,
+ *tree_block_level);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, 0, offset, key->objectid,
+ *tree_block_level);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ret = add_extent_data_ref(fs_info, leaf, dref,
+ key->objectid, key->offset);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sref);
+ ret = add_shared_data_ref(fs_info, offset, count,
+ key->objectid, key->offset);
+ break;
+ default:
+ btrfs_err(fs_info, "invalid key type in iref");
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ break;
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ return ret;
+}
+
+static int process_leaf(struct btrfs_root *root,
+ struct btrfs_path *path, u64 *bytenr, u64 *num_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ u32 count;
+ int i = 0, tree_block_level = 0, ret;
+ struct btrfs_key key;
+ int nritems = btrfs_header_nritems(leaf);
+
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ switch (key.type) {
+ case BTRFS_EXTENT_ITEM_KEY:
+ *num_bytes = key.offset;
+ case BTRFS_METADATA_ITEM_KEY:
+ *bytenr = key.objectid;
+ ret = process_extent_item(fs_info, path, &key, i,
+ &tree_block_level);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, key.offset, 0,
+ key.objectid, tree_block_level);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, 0, key.offset,
+ key.objectid, tree_block_level);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = btrfs_item_ptr(leaf, i,
+ struct btrfs_extent_data_ref);
+ ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr,
+ *num_bytes);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = btrfs_item_ptr(leaf, i,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sref);
+ ret = add_shared_data_ref(fs_info, key.offset, count,
+ *bytenr, *num_bytes);
+ break;
+ default:
+ break;
+ }
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+/* Walk down to the leaf from the given level */
+static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int level, u64 *bytenr, u64 *num_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *eb;
+ u64 block_bytenr, gen;
+ int ret = 0;
+
+ while (level >= 0) {
+ if (level) {
+ block_bytenr = btrfs_node_blockptr(path->nodes[level],
+ path->slots[level]);
+ gen = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+ eb = read_tree_block(fs_info, block_bytenr, gen);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+ if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->nodes[level-1] = eb;
+ path->slots[level-1] = 0;
+ path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
+ } else {
+ ret = process_leaf(root, path, bytenr, num_bytes);
+ if (ret)
+ break;
+ }
+ level--;
+ }
+ return ret;
+}
+
+/* Walk up to the next node that needs to be processed */
+static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ int l;
+
+ for (l = 0; l < BTRFS_MAX_LEVEL; l++) {
+ if (!path->nodes[l])
+ continue;
+ if (l) {
+ path->slots[l]++;
+ if (path->slots[l] <
+ btrfs_header_nritems(path->nodes[l])) {
+ *level = l;
+ return 0;
+ }
+ }
+ btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]);
+ free_extent_buffer(path->nodes[l]);
+ path->nodes[l] = NULL;
+ path->slots[l] = 0;
+ path->locks[l] = 0;
+ }
+
+ return 1;
+}
+
+static void dump_ref_action(struct btrfs_fs_info *fs_info,
+ struct ref_action *ra)
+{
+ btrfs_err(fs_info,
+" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
+ ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent,
+ ra->ref.owner, ra->ref.offset, ra->ref.num_refs);
+ __print_stack_trace(fs_info, ra);
+}
+
+/*
+ * Dumps all the information from the block entry to printk, it's going to be
+ * awesome.
+ */
+static void dump_block_entry(struct btrfs_fs_info *fs_info,
+ struct block_entry *be)
+{
+ struct ref_entry *ref;
+ struct root_entry *re;
+ struct ref_action *ra;
+ struct rb_node *n;
+
+ btrfs_err(fs_info,
+"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d",
+ be->bytenr, be->len, be->num_refs, be->metadata,
+ be->from_disk);
+
+ for (n = rb_first(&be->refs); n; n = rb_next(n)) {
+ ref = rb_entry(n, struct ref_entry, node);
+ btrfs_err(fs_info,
+" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
+ ref->root_objectid, ref->parent, ref->owner,
+ ref->offset, ref->num_refs);
+ }
+
+ for (n = rb_first(&be->roots); n; n = rb_next(n)) {
+ re = rb_entry(n, struct root_entry, node);
+ btrfs_err(fs_info, " root entry %llu, num_refs %llu",
+ re->root_objectid, re->num_refs);
+ }
+
+ list_for_each_entry(ra, &be->actions, list)
+ dump_ref_action(fs_info, ra);
+}
+
+/*
+ * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ * @root: the root we are making this modification from.
+ * @bytenr: the bytenr we are modifying.
+ * @num_bytes: number of bytes.
+ * @parent: the parent bytenr.
+ * @ref_root: the original root owner of the bytenr.
+ * @owner: level in the case of metadata, inode in the case of data.
+ * @offset: 0 for metadata, file offset for data.
+ * @action: the action that we are doing, this is the same as the delayed ref
+ * action.
+ *
+ * This will add an action item to the given bytenr and do sanity checks to make
+ * sure we haven't messed something up. If we are making a new allocation and
+ * this block entry has history we will delete all previous actions as long as
+ * our sanity checks pass as they are no longer needed.
+ */
+int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 ref_root, u64 owner, u64 offset,
+ int action)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ref_entry *ref = NULL, *exist;
+ struct ref_action *ra = NULL;
+ struct block_entry *be = NULL;
+ struct root_entry *re = NULL;
+ int ret = 0;
+ bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+ if (!btrfs_test_opt(root->fs_info, REF_VERIFY))
+ return 0;
+
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
+ ra = kmalloc(sizeof(struct ref_action), GFP_NOFS);
+ if (!ra || !ref) {
+ kfree(ref);
+ kfree(ra);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (parent) {
+ ref->parent = parent;
+ } else {
+ ref->root_objectid = ref_root;
+ ref->owner = owner;
+ ref->offset = offset;
+ }
+ ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;
+
+ memcpy(&ra->ref, ref, sizeof(struct ref_entry));
+ /*
+ * Save the extra info from the delayed ref in the ref action to make it
+ * easier to figure out what is happening. The real ref's we add to the
+ * ref tree need to reflect what we save on disk so it matches any
+ * on-disk refs we pre-loaded.
+ */
+ ra->ref.owner = owner;
+ ra->ref.offset = offset;
+ ra->ref.root_objectid = ref_root;
+ __save_stack_trace(ra);
+
+ INIT_LIST_HEAD(&ra->list);
+ ra->action = action;
+ ra->root = root->objectid;
+
+ /*
+ * This is an allocation, preallocate the block_entry in case we haven't
+ * used it before.
+ */
+ ret = -EINVAL;
+ if (action == BTRFS_ADD_DELAYED_EXTENT) {
+ /*
+ * For subvol_create we'll just pass in whatever the parent root
+ * is and the new root objectid, so let's not treat the passed
+ * in root as if it really has a ref for this bytenr.
+ */
+ be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root);
+ if (IS_ERR(be)) {
+ kfree(ra);
+ ret = PTR_ERR(be);
+ goto out;
+ }
+ be->num_refs++;
+ if (metadata)
+ be->metadata = 1;
+
+ if (be->num_refs != 1) {
+ btrfs_err(fs_info,
+ "re-allocated a block that still has references to it!");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ goto out_unlock;
+ }
+
+ while (!list_empty(&be->actions)) {
+ struct ref_action *tmp;
+
+ tmp = list_first_entry(&be->actions, struct ref_action,
+ list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+ } else {
+ struct root_entry *tmp;
+
+ if (!parent) {
+ re = kmalloc(sizeof(struct root_entry), GFP_NOFS);
+ if (!re) {
+ kfree(ref);
+ kfree(ra);
+ ret = -ENOMEM;
+ goto out;
+ }
+ /*
+ * This is the root that is modifying us, so it's the
+ * one we want to lookup below when we modify the
+ * re->num_refs.
+ */
+ ref_root = root->objectid;
+ re->root_objectid = root->objectid;
+ re->num_refs = 0;
+ }
+
+ spin_lock(&root->fs_info->ref_verify_lock);
+ be = lookup_block_entry(&root->fs_info->block_tree, bytenr);
+ if (!be) {
+ btrfs_err(fs_info,
+"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
+ action, (unsigned long long)bytenr,
+ (unsigned long long)num_bytes);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
+ }
+
+ if (!parent) {
+ tmp = insert_root_entry(&be->roots, re);
+ if (tmp) {
+ kfree(re);
+ re = tmp;
+ }
+ }
+ }
+
+ exist = insert_ref_entry(&be->refs, ref);
+ if (exist) {
+ if (action == BTRFS_DROP_DELAYED_REF) {
+ if (exist->num_refs == 0) {
+ btrfs_err(fs_info,
+"dropping a ref for a existing root that doesn't have a ref on the block");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ra);
+ goto out_unlock;
+ }
+ exist->num_refs--;
+ if (exist->num_refs == 0) {
+ rb_erase(&exist->node, &be->refs);
+ kfree(exist);
+ }
+ } else if (!be->metadata) {
+ exist->num_refs++;
+ } else {
+ btrfs_err(fs_info,
+"attempting to add another ref for an existing ref on a tree block");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ra);
+ goto out_unlock;
+ }
+ kfree(ref);
+ } else {
+ if (action == BTRFS_DROP_DELAYED_REF) {
+ btrfs_err(fs_info,
+"dropping a ref for a root that doesn't have a ref on the block");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ra);
+ goto out_unlock;
+ }
+ }
+
+ if (!parent && !re) {
+ re = lookup_root_entry(&be->roots, ref_root);
+ if (!re) {
+ /*
+ * This shouldn't happen because we will add our re
+ * above when we lookup the be with !parent, but just in
+ * case catch this case so we don't panic because I
+ * didn't thik of some other corner case.
+ */
+ btrfs_err(fs_info, "failed to find root %llu for %llu",
+ root->objectid, be->bytenr);
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ra);
+ goto out_unlock;
+ }
+ }
+ if (action == BTRFS_DROP_DELAYED_REF) {
+ if (re)
+ re->num_refs--;
+ be->num_refs--;
+ } else if (action == BTRFS_ADD_DELAYED_REF) {
+ be->num_refs++;
+ if (re)
+ re->num_refs++;
+ }
+ list_add_tail(&ra->list, &be->actions);
+ ret = 0;
+out_unlock:
+ spin_unlock(&root->fs_info->ref_verify_lock);
+out:
+ if (ret)
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return ret;
+}
+
+/* Free up the ref cache */
+void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
+{
+ struct block_entry *be;
+ struct rb_node *n;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return;
+
+ spin_lock(&fs_info->ref_verify_lock);
+ while ((n = rb_first(&fs_info->block_tree))) {
+ be = rb_entry(n, struct block_entry, node);
+ rb_erase(&be->node, &fs_info->block_tree);
+ free_block_entry(be);
+ cond_resched_lock(&fs_info->ref_verify_lock);
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+}
+
+void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len)
+{
+ struct block_entry *be = NULL, *entry;
+ struct rb_node *n;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return;
+
+ spin_lock(&fs_info->ref_verify_lock);
+ n = fs_info->block_tree.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct block_entry, node);
+ if (entry->bytenr < start) {
+ n = n->rb_right;
+ } else if (entry->bytenr > start) {
+ n = n->rb_left;
+ } else {
+ be = entry;
+ break;
+ }
+ /* We want to get as close to start as possible */
+ if (be == NULL ||
+ (entry->bytenr < start && be->bytenr > start) ||
+ (entry->bytenr < start && entry->bytenr > be->bytenr))
+ be = entry;
+ }
+
+ /*
+ * Could have an empty block group, maybe have something to check for
+ * this case to verify we were actually empty?
+ */
+ if (!be) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ return;
+ }
+
+ n = &be->node;
+ while (n) {
+ be = rb_entry(n, struct block_entry, node);
+ n = rb_next(n);
+ if (be->bytenr < start && be->bytenr + be->len > start) {
+ btrfs_err(fs_info,
+ "block entry overlaps a block group [%llu,%llu]!",
+ start, len);
+ dump_block_entry(fs_info, be);
+ continue;
+ }
+ if (be->bytenr < start)
+ continue;
+ if (be->bytenr >= start + len)
+ break;
+ if (be->bytenr + be->len > start + len) {
+ btrfs_err(fs_info,
+ "block entry overlaps a block group [%llu,%llu]!",
+ start, len);
+ dump_block_entry(fs_info, be);
+ }
+ rb_erase(&be->node, &fs_info->block_tree);
+ free_block_entry(be);
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+}
+
+/* Walk down all roots and build the ref tree, meant to be called at mount */
+int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ u64 bytenr = 0, num_bytes = 0;
+ int ret, level;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ eb = btrfs_read_lock_root_node(fs_info->extent_root);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ level = btrfs_header_level(eb);
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ while (1) {
+ /*
+ * We have to keep track of the bytenr/num_bytes we last hit
+ * because we could have run out of space for an inline ref, and
+ * would have had to added a ref key item which may appear on a
+ * different leaf from the original extent item.
+ */
+ ret = walk_down_tree(fs_info->extent_root, path, level,
+ &bytenr, &num_bytes);
+ if (ret)
+ break;
+ ret = walk_up_tree(root, path, &level);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret) {
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ btrfs_free_ref_cache(fs_info);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
new file mode 100644
index 000000000000..3bf02ce0e1e2
--- /dev/null
+++ b/fs/btrfs/ref-verify.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __REF_VERIFY__
+#define __REF_VERIFY__
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
+void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
+int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 ref_root, u64 owner, u64 offset,
+ int action);
+void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len);
+
+static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->ref_verify_lock);
+ fs_info->block_tree = RB_ROOT;
+}
+#else
+static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+
+static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
+{
+}
+
+static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root,
+ u64 owner, u64 offset, int action)
+{
+ return 0;
+}
+
+static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len)
+{
+}
+
+static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
+{
+}
+
+#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
+#endif /* _REF_VERIFY__ */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3a49a3c2fca4..4cf2eb67eba6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1742,7 +1742,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
- ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
+ ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
key.objectid, key.offset);
@@ -1751,7 +1751,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
break;
}
- ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
key.objectid, key.offset);
if (ret) {
@@ -1952,21 +1952,21 @@ again:
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(path->nodes[level]);
- ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr,
+ ret = btrfs_inc_extent_ref(trans, src, old_bytenr,
blocksize, path->nodes[level]->start,
src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
- ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
+ ret = btrfs_inc_extent_ref(trans, dest, new_bytenr,
blocksize, 0, dest->root_key.objectid,
level - 1, 0);
BUG_ON(ret);
- ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize,
+ ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
- ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize,
+ ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
0);
BUG_ON(ret);
@@ -2400,11 +2400,11 @@ void free_reloc_roots(struct list_head *list)
while (!list_empty(list)) {
reloc_root = list_entry(list->next, struct btrfs_root,
root_list);
+ __del_reloc_root(reloc_root);
free_extent_buffer(reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
reloc_root->node = NULL;
reloc_root->commit_root = NULL;
- __del_reloc_root(reloc_root);
}
}
@@ -2808,7 +2808,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(upper->eb);
- ret = btrfs_inc_extent_ref(trans, root->fs_info,
+ ret = btrfs_inc_extent_ref(trans, root,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
@@ -3246,6 +3246,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
@@ -3275,6 +3277,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 95bcc3cce78f..3338407ef0f0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -226,10 +226,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *root;
int err = 0;
int ret;
- bool can_recover = true;
-
- if (sb_rdonly(fs_info->sb))
- can_recover = false;
path = btrfs_alloc_path();
if (!path)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e3f6c49e5c4d..b2f871d80982 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -231,7 +231,7 @@ struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
const char *errstr;
- sector_t sector;
+ u64 physical;
u64 logical;
struct btrfs_device *dev;
};
@@ -797,10 +797,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
- (unsigned long long)swarn->sector,
+ swarn->physical,
root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
@@ -810,10 +810,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
err:
btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
- (unsigned long long)swarn->sector,
+ swarn->physical,
root, inum, offset, ret);
free_ipath(ipath);
@@ -845,7 +845,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (!path)
return;
- swarn.sector = (sblock->pagev[0]->physical) >> 9;
+ swarn.physical = sblock->pagev[0]->physical;
swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -868,10 +868,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
item_size, &ref_root,
&ref_level);
btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu",
+"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical,
rcu_str_deref(dev->name),
- (unsigned long long)swarn.sector,
+ swarn.physical,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
ret < 0 ? -1 : ref_root);
@@ -883,7 +883,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
swarn.dev = dev;
iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1,
- scrub_print_warning_inode, &swarn);
+ scrub_print_warning_inode, &swarn, false);
}
out:
@@ -1047,7 +1047,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
* can be found.
*/
ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
- scrub_fixup_readpage, fixup);
+ scrub_fixup_readpage, fixup, false);
if (ret < 0) {
uncorrectable = 1;
goto out;
@@ -4390,7 +4390,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
}
ret = iterate_inodes_from_logical(logical, fs_info, path,
- record_inode_for_nocow, nocow_ctx);
+ record_inode_for_nocow, nocow_ctx, false);
if (ret != 0 && ret != -ENOENT) {
btrfs_warn(fs_info,
"iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 32b043ef8ac9..c10e4c70f02d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -26,6 +26,7 @@
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
+#include <linux/compat.h>
#include "send.h"
#include "backref.h"
@@ -992,7 +993,6 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
* path must point to the dir item when called.
*/
static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *found_key,
iterate_dir_item_t iterate, void *ctx)
{
int ret = 0;
@@ -1271,12 +1271,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
*/
if (ino >= bctx->cur_objectid)
return 0;
-#if 0
- if (ino > bctx->cur_objectid)
- return 0;
- if (offset + bctx->extent_len > bctx->cur_offset)
- return 0;
-#endif
}
bctx->found++;
@@ -1429,7 +1423,7 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = 0;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1, __iterate_backrefs,
- backref_ctx);
+ backref_ctx, false);
if (ret < 0)
goto out;
@@ -2630,7 +2624,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
} else {
btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -4106,8 +4100,8 @@ out:
return ret;
}
-static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
- struct fs_path *name, void *ctx, struct list_head *refs)
+static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
+ void *ctx, struct list_head *refs)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -4143,8 +4137,7 @@ static int __record_new_ref(int num, u64 dir, int index,
void *ctx)
{
struct send_ctx *sctx = ctx;
- return record_ref(sctx->send_root, num, dir, index, name,
- ctx, &sctx->new_refs);
+ return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
}
@@ -4153,8 +4146,8 @@ static int __record_deleted_ref(int num, u64 dir, int index,
void *ctx)
{
struct send_ctx *sctx = ctx;
- return record_ref(sctx->parent_root, num, dir, index, name,
- ctx, &sctx->deleted_refs);
+ return record_ref(sctx->parent_root, dir, name, ctx,
+ &sctx->deleted_refs);
}
static int record_new_ref(struct send_ctx *sctx)
@@ -4498,7 +4491,7 @@ static int process_new_xattr(struct send_ctx *sctx)
int ret = 0;
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
- sctx->cmp_key, __process_new_xattr, sctx);
+ __process_new_xattr, sctx);
return ret;
}
@@ -4506,7 +4499,7 @@ static int process_new_xattr(struct send_ctx *sctx)
static int process_deleted_xattr(struct send_ctx *sctx)
{
return iterate_dir_item(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, __process_deleted_xattr, sctx);
+ __process_deleted_xattr, sctx);
}
struct find_xattr_ctx {
@@ -4551,7 +4544,7 @@ static int find_xattr(struct btrfs_root *root,
ctx.found_data = NULL;
ctx.found_data_len = 0;
- ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
+ ret = iterate_dir_item(root, path, __find_xattr, &ctx);
if (ret < 0)
return ret;
@@ -4621,11 +4614,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
int ret = 0;
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
- sctx->cmp_key, __process_changed_new_xattr, sctx);
+ __process_changed_new_xattr, sctx);
if (ret < 0)
goto out;
ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, __process_changed_deleted_xattr, sctx);
+ __process_changed_deleted_xattr, sctx);
out:
return ret;
@@ -4675,8 +4668,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
goto out;
}
- ret = iterate_dir_item(root, path, &found_key,
- __process_new_xattr, sctx);
+ ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
if (ret < 0)
goto out;
@@ -4723,16 +4715,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
/* initial readahead */
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, inode->i_mapping);
- page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index,
- last_index - index + 1);
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
+
+ page = find_lock_page(inode->i_mapping, index);
if (!page) {
- ret = -ENOMEM;
- break;
+ page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
+ NULL, index, last_index + 1 - index);
+
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(inode->i_mapping, &sctx->ra,
+ NULL, page, index, last_index + 1 - index);
}
if (!PageUptodate(page)) {
@@ -6162,9 +6165,7 @@ out:
* Updates compare related fields in sctx and simply forwards to the actual
* changed_xxx functions.
*/
-static int changed_cb(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- struct btrfs_path *left_path,
+static int changed_cb(struct btrfs_path *left_path,
struct btrfs_path *right_path,
struct btrfs_key *key,
enum btrfs_compare_tree_result result,
@@ -6246,8 +6247,8 @@ static int full_send_tree(struct send_ctx *sctx)
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
- ret = changed_cb(send_root, NULL, path, NULL,
- &found_key, BTRFS_COMPARE_TREE_NEW, sctx);
+ ret = changed_cb(path, NULL, &found_key,
+ BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
goto out;
@@ -6365,13 +6366,12 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
spin_unlock(&root->root_item_lock);
}
-long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
+long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
- struct btrfs_ioctl_send_args *arg = NULL;
struct btrfs_key key;
struct send_ctx *sctx = NULL;
u32 i;
@@ -6407,13 +6407,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
- arg = memdup_user(arg_, sizeof(*arg));
- if (IS_ERR(arg)) {
- ret = PTR_ERR(arg);
- arg = NULL;
- goto out;
- }
-
/*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
@@ -6654,7 +6647,6 @@ out:
if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
btrfs_root_dec_send_in_progress(sctx->parent_root);
- kfree(arg);
kvfree(clone_sources_tmp);
if (sctx) {
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 02e00166c4da..3aa4bc55754f 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,5 +130,5 @@ enum {
#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
#ifdef __KERNEL__
-long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 35a128acfbd1..65af029559b5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,7 +202,6 @@ static struct ratelimit_state printk_limits[] = {
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
- struct super_block *sb = fs_info->sb;
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
@@ -228,7 +227,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
vaf.va = &args;
if (__ratelimit(ratelimit))
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
+ fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);
va_end(args);
}
@@ -292,7 +292,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
vaf.va = &args;
errstr = btrfs_decode_error(errno);
- if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+ if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
s_id, function, line, &vaf, errno, errstr);
@@ -326,6 +326,9 @@ enum {
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ Opt_ref_verify,
+#endif
Opt_err,
};
@@ -387,6 +390,9 @@ static const match_table_t tokens = {
{Opt_fragment_metadata, "fragment=metadata"},
{Opt_fragment_all, "fragment=all"},
#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ {Opt_ref_verify, "ref_verify"},
+#endif
{Opt_err, NULL},
};
@@ -502,6 +508,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
strncmp(args[0].from, "zlib", 4) == 0) {
compress_type = "zlib";
info->compress_type = BTRFS_COMPRESS_ZLIB;
+ info->compress_level =
+ btrfs_compress_str2level(args[0].from);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -549,9 +557,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
compress_force != saved_compress_force)) ||
(!btrfs_test_opt(info, COMPRESS) &&
no_compress == 1)) {
- btrfs_info(info, "%s %s compression",
+ btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
- compress_type);
+ compress_type, info->compress_level);
}
compress_force = false;
break;
@@ -825,6 +833,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
break;
#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ case Opt_ref_verify:
+ btrfs_info(info, "doing ref verification");
+ btrfs_set_opt(info->mount_opt, REF_VERIFY);
+ break;
+#endif
case Opt_err:
btrfs_info(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
@@ -1135,7 +1149,7 @@ static int btrfs_fill_super(struct super_block *sb,
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
- sb->s_flags |= MS_I_VERSION;
+ sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
@@ -1205,8 +1219,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
* happens. The pending operations are delayed to the
* next commit after thawing.
*/
- if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
- __sb_end_write(sb, SB_FREEZE_WRITE);
+ if (sb_start_write_trylock(sb))
+ sb_end_write(sb);
else
return 0;
trans = btrfs_start_transaction(root, 0);
@@ -1246,6 +1260,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
+ if (info->compress_level)
+ seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
seq_puts(seq, ",nossd");
@@ -1305,6 +1321,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, FRAGMENT_METADATA))
seq_puts(seq, ",fragment=metadata");
#endif
+ if (btrfs_test_opt(info, REF_VERIFY))
+ seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
seq_puts(seq, ",subvol=");
@@ -2112,7 +2130,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* succeed even if the Avail is zero. But this is better than the other
* way around.
*/
- thresh = 4 * 1024 * 1024;
+ thresh = SZ_4M;
if (!mixed && total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
@@ -2319,6 +2337,9 @@ static void btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
", integrity-checker=on"
#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ ", ref-verify=on"
+#endif
"\n",
btrfs_crc32c_impl());
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 883881b16c86..a28bba801264 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -247,7 +247,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
+BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show);
static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -256,15 +256,15 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
+BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show);
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
static ssize_t raid_bytes_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
-BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
-BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
+BTRFS_ATTR(raid, total_bytes, raid_bytes_show);
+BTRFS_ATTR(raid, used_bytes, raid_bytes_show);
static ssize_t raid_bytes_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -277,7 +277,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
down_read(&sinfo->groups_sem);
list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
- if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
+ if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
val += block_group->key.offset;
else
val += btrfs_block_group_used(&block_group->item);
@@ -287,8 +287,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
}
static struct attribute *raid_attributes[] = {
- BTRFS_RAID_ATTR_PTR(total_bytes),
- BTRFS_RAID_ATTR_PTR(used_bytes),
+ BTRFS_ATTR_PTR(raid, total_bytes),
+ BTRFS_ATTR_PTR(raid, used_bytes),
NULL
};
@@ -311,7 +311,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
struct btrfs_space_info *sinfo = to_space_info(kobj); \
return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
} \
-BTRFS_ATTR(field, btrfs_space_info_show_##field)
+BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
struct kobj_attribute *a,
@@ -331,19 +331,20 @@ SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
+BTRFS_ATTR(space_info, total_bytes_pinned,
+ btrfs_space_info_show_total_bytes_pinned);
static struct attribute *space_info_attrs[] = {
- BTRFS_ATTR_PTR(flags),
- BTRFS_ATTR_PTR(total_bytes),
- BTRFS_ATTR_PTR(bytes_used),
- BTRFS_ATTR_PTR(bytes_pinned),
- BTRFS_ATTR_PTR(bytes_reserved),
- BTRFS_ATTR_PTR(bytes_may_use),
- BTRFS_ATTR_PTR(bytes_readonly),
- BTRFS_ATTR_PTR(disk_used),
- BTRFS_ATTR_PTR(disk_total),
- BTRFS_ATTR_PTR(total_bytes_pinned),
+ BTRFS_ATTR_PTR(space_info, flags),
+ BTRFS_ATTR_PTR(space_info, total_bytes),
+ BTRFS_ATTR_PTR(space_info, bytes_used),
+ BTRFS_ATTR_PTR(space_info, bytes_pinned),
+ BTRFS_ATTR_PTR(space_info, bytes_reserved),
+ BTRFS_ATTR_PTR(space_info, bytes_may_use),
+ BTRFS_ATTR_PTR(space_info, bytes_readonly),
+ BTRFS_ATTR_PTR(space_info, disk_used),
+ BTRFS_ATTR_PTR(space_info, disk_total),
+ BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
NULL,
};
@@ -361,8 +362,8 @@ struct kobj_type space_info_ktype = {
};
static const struct attribute *allocation_attrs[] = {
- BTRFS_ATTR_PTR(global_rsv_reserved),
- BTRFS_ATTR_PTR(global_rsv_size),
+ BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
+ BTRFS_ATTR_PTR(allocation, global_rsv_size),
NULL,
};
@@ -415,7 +416,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return len;
}
-BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
+BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store);
static ssize_t btrfs_nodesize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -425,7 +426,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
-BTRFS_ATTR(nodesize, btrfs_nodesize_show);
+BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -436,7 +437,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
+BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -447,7 +448,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
+BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
static ssize_t quota_override_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -487,14 +488,14 @@ static ssize_t quota_override_store(struct kobject *kobj,
return len;
}
-BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
+BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store);
static const struct attribute *btrfs_attrs[] = {
- BTRFS_ATTR_PTR(label),
- BTRFS_ATTR_PTR(nodesize),
- BTRFS_ATTR_PTR(sectorsize),
- BTRFS_ATTR_PTR(clone_alignment),
- BTRFS_ATTR_PTR(quota_override),
+ BTRFS_ATTR_PTR(, label),
+ BTRFS_ATTR_PTR(, nodesize),
+ BTRFS_ATTR_PTR(, sectorsize),
+ BTRFS_ATTR_PTR(, clone_alignment),
+ BTRFS_ATTR_PTR(, quota_override),
NULL,
};
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index d7da1a4c2f6c..80457f31c29f 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BTRFS_SYSFS_H_
#define _BTRFS_SYSFS_H_
@@ -20,21 +21,16 @@ enum btrfs_feature_set {
.store = _store, \
}
-#define BTRFS_ATTR_RW(_name, _show, _store) \
- static struct kobj_attribute btrfs_attr_##_name = \
+#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
-#define BTRFS_ATTR(_name, _show) \
- static struct kobj_attribute btrfs_attr_##_name = \
+#define BTRFS_ATTR(_prefix, _name, _show) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
-#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
-
-#define BTRFS_RAID_ATTR(_name, _show) \
- static struct kobj_attribute btrfs_raid_attr_##_name = \
- __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
-
-#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
+#define BTRFS_ATTR_PTR(_prefix, _name) \
+ (&btrfs_attr_##_prefix##_##_name.attr)
struct btrfs_feature_attr {
@@ -43,15 +39,16 @@ struct btrfs_feature_attr {
u64 feature_bit;
};
-#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \
-static struct btrfs_feature_attr btrfs_attr_##_name = { \
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \
+static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
.kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
btrfs_feature_attr_show, \
btrfs_feature_attr_store), \
.feature_set = _feature_set, \
- .feature_bit = _prefix ##_## _feature_bit, \
+ .feature_bit = _feature_prefix ##_## _feature_bit, \
}
-#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr)
+#define BTRFS_FEAT_ATTR_PTR(_name) \
+ (&btrfs_attr_features_##_name.kobj_attr.attr)
#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 1458bb0ea124..8444a018cca2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -500,7 +500,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
path = btrfs_alloc_path();
if (!path) {
test_msg("Couldn't allocate path\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
ret = add_block_group_free_space(&trans, root->fs_info, cache);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 8c91d03cc82d..f797642c013d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -968,7 +968,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
- BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
NULL, 0);
if (ret) {
@@ -983,7 +982,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
NULL, 0);
@@ -1003,7 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DIRTY |
- EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+ EXTENT_UPTODATE, 0, 0,
NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
@@ -1017,7 +1015,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
@@ -1035,12 +1032,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
- *
- * I'm artificially adding 2 to outstanding_extents because in the
- * buffered IO case we'd add things up as we go, but I don't feel like
- * doing that here, this isn't the interesting case we want to test.
*/
- BTRFS_I(inode)->outstanding_extents += 2;
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
@@ -1059,7 +1051,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0);
@@ -1079,7 +1070,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ EXTENT_UPTODATE, 0, 0,
NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
@@ -1096,7 +1087,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
* Refill the hole again just for good measure, because I thought it
* might fail and I'd rather satisfy my paranoia at this point.
*/
- BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0);
@@ -1114,7 +1104,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ EXTENT_UPTODATE, 0, 0,
NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
@@ -1131,7 +1121,7 @@ out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ EXTENT_UPTODATE, 0, 0,
NULL, GFP_KERNEL);
iput(inode);
btrfs_free_dummy_root(root);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 0f4ce970d195..90204b166643 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -240,7 +240,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+ false);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
@@ -252,7 +253,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+ false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -275,7 +277,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
old_roots = NULL;
new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+ false);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
@@ -286,7 +289,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret)
return -EINVAL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+ false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -337,7 +341,8 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+ false);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
@@ -349,7 +354,8 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+ false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -370,7 +376,8 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+ false);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
@@ -382,7 +389,8 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+ false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -409,7 +417,8 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+ false);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
@@ -421,7 +430,8 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+ false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f615d59b0489..5a8c2649af2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -797,8 +797,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (fs_info->global_block_rsv.space_info->full &&
- btrfs_check_space_for_delayed_refs(trans, fs_info))
+ if (btrfs_check_space_for_delayed_refs(trans, fs_info))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -950,6 +949,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
+ atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
bool wait_writeback = false;
@@ -985,6 +985,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
cond_resched();
start = end + 1;
}
+ atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
return werr;
}
@@ -1915,8 +1916,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
+ /*
+ * We use writeback_inodes_sb here because if we used
+ * btrfs_start_delalloc_roots we would deadlock with fs freeze.
+ * Currently are holding the fs freeze lock, if we do an async flush
+ * we'll do btrfs_join_transaction() and deadlock because we need to
+ * wait for the fs freeze lock. Using the direct flushing we benefit
+ * from already being in a transaction and our join_transaction doesn't
+ * have to re-take the fs freeze lock.
+ */
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- return btrfs_start_delalloc_roots(fs_info, 1, -1);
+ writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
new file mode 100644
index 000000000000..114fc5f0ecc5
--- /dev/null
+++ b/fs/btrfs/tree-checker.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+/*
+ * The module is used to catch unexpected/corrupted tree block data.
+ * Such behavior can be caused either by a fuzzed image or bugs.
+ *
+ * The objective is to do leaf/node validation checks when tree block is read
+ * from disk, and check *every* possible member, so other code won't
+ * need to checking them again.
+ *
+ * Due to the potential and unwanted damage, every checker needs to be
+ * carefully reviewed otherwise so it does not prevent mount of valid images.
+ */
+
+#include "ctree.h"
+#include "tree-checker.h"
+#include "disk-io.h"
+#include "compression.h"
+
+/*
+ * Error message should follow the following format:
+ * corrupt <type>: <identifier>, <reason>[, <bad_value>]
+ *
+ * @type: leaf or node
+ * @identifier: the necessary info to locate the leaf/node.
+ * It's recommened to decode key.objecitd/offset if it's
+ * meaningful.
+ * @reason: describe the error
+ * @bad_value: optional, it's recommened to output bad value and its
+ * expected value (range).
+ *
+ * Since comma is used to separate the components, only space is allowed
+ * inside each component.
+ */
+
+/*
+ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt.
+ * Allows callers to customize the output.
+ */
+__printf(4, 5)
+static void generic_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
+ va_end(args);
+}
+
+/*
+ * Customized reporter for extent data item, since its key objectid and
+ * offset has its own meaning.
+ */
+__printf(4, 5)
+static void file_extent_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
+ btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+/*
+ * Return 0 if the btrfs_file_extent_##name is aligned to @alignment
+ * Else return 1
+ */
+#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \
+({ \
+ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
+ file_extent_err((root), (leaf), (slot), \
+ "invalid %s for file extent, have %llu, should be aligned to %u", \
+ (#name), btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)); \
+ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
+})
+
+static int check_extent_data_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_file_extent_item *fi;
+ u32 sectorsize = root->fs_info->sectorsize;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ file_extent_err(root, leaf, slot,
+"unaligned file_offset for file extent, have %llu should be aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+ file_extent_err(root, leaf, slot,
+ "invalid type for file extent, have %u expect range [0, %u]",
+ btrfs_file_extent_type(leaf, fi),
+ BTRFS_FILE_EXTENT_TYPES);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Support for new compression/encrption must introduce incompat flag,
+ * and must be caught in open_ctree().
+ */
+ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+ file_extent_err(root, leaf, slot,
+ "invalid compression for file extent, have %u expect range [0, %u]",
+ btrfs_file_extent_compression(leaf, fi),
+ BTRFS_COMPRESS_TYPES);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_encryption(leaf, fi)) {
+ file_extent_err(root, leaf, slot,
+ "invalid encryption for file extent, have %u expect 0",
+ btrfs_file_extent_encryption(leaf, fi));
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ /* Inline extent must have 0 as key offset */
+ if (key->offset) {
+ file_extent_err(root, leaf, slot,
+ "invalid file_offset for inline file extent, have %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+
+ /* Compressed inline extent has no on-disk size, skip it */
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE)
+ return 0;
+
+ /* Uncompressed inline extent size must match item size */
+ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi)) {
+ file_extent_err(root, leaf, slot,
+ "invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi));
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /* Regular or preallocated extent has fixed item size */
+ if (item_size != sizeof(*fi)) {
+ file_extent_err(root, leaf, slot,
+ "invalid item size for reg/prealloc file extent, have %u expect %zu",
+ item_size, sizeof(*fi));
+ return -EUCLEAN;
+ }
+ if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize))
+ return -EUCLEAN;
+ return 0;
+}
+
+static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ u32 sectorsize = root->fs_info->sectorsize;
+ u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ generic_err(root, leaf, slot,
+ "invalid key objectid for csum item, have %llu expect %llu",
+ key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ generic_err(root, leaf, slot,
+ "unaligned key offset for csum item, have %llu should be aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ generic_err(root, leaf, slot,
+ "unaligned item size for csum item, have %u should be aligned to %u",
+ btrfs_item_size_nr(leaf, slot), csumsize);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+/*
+ * Common point to switch the item-specific validation.
+ */
+static int check_leaf_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ int ret = 0;
+
+ switch (key->type) {
+ case BTRFS_EXTENT_DATA_KEY:
+ ret = check_extent_data_item(root, leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ ret = check_csum_item(root, leaf, key, slot);
+ break;
+ }
+ return ret;
+}
+
+int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ /* No valid key type is 0, so all key should be larger than this key */
+ struct btrfs_key prev_key = {0, 0, 0};
+ struct btrfs_key key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ /*
+ * Extent buffers from a relocation tree have a owner field that
+ * corresponds to the subvolume tree they are based on. So just from an
+ * extent buffer alone we can not find out what is the id of the
+ * corresponding subvolume tree, so we can not figure out if the extent
+ * buffer corresponds to the root of the relocation tree or not. So
+ * skip this check for relocation trees.
+ */
+ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ struct btrfs_root *check_root;
+
+ key.objectid = btrfs_header_owner(leaf);
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ check_root = btrfs_get_fs_root(fs_info, &key, false);
+ /*
+ * The only reason we also check NULL here is that during
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
+ struct extent_buffer *eb;
+
+ eb = btrfs_root_node(check_root);
+ /* if leaf is the root, then it's fine */
+ if (leaf != eb) {
+ generic_err(check_root, leaf, 0,
+ "invalid nritems, have %u should not be 0 for non-root leaf",
+ nritems);
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ free_extent_buffer(eb);
+ }
+ return 0;
+ }
+
+ if (nritems == 0)
+ return 0;
+
+ /*
+ * Check the following things to make sure this is a good leaf, and
+ * leaf users won't need to bother with similar sanity checks:
+ *
+ * 1) key ordering
+ * 2) item offset and size
+ * No overlap, no hole, all inside the leaf.
+ * 3) item content
+ * If possible, do comprehensive sanity check.
+ * NOTE: All checks must only rely on the item data itself.
+ */
+ for (slot = 0; slot < nritems; slot++) {
+ u32 item_end_expected;
+ int ret;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ generic_err(root, leaf, slot,
+ "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
+ prev_key.objectid, prev_key.type,
+ prev_key.offset, key.objectid, key.type,
+ key.offset);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (slot == 0)
+ item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
+ else
+ item_end_expected = btrfs_item_offset_nr(leaf,
+ slot - 1);
+ if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ generic_err(root, leaf, slot,
+ "unexpected item end, have %u expect %u",
+ btrfs_item_end_nr(leaf, slot),
+ item_end_expected);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent to each other, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ generic_err(root, leaf, slot,
+ "slot end outside of leaf, have %u expect range [0, %u]",
+ btrfs_item_end_nr(leaf, slot),
+ BTRFS_LEAF_DATA_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+
+ /* Also check if the item pointer overlaps with btrfs item. */
+ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
+ btrfs_item_ptr_offset(leaf, slot)) {
+ generic_err(root, leaf, slot,
+ "slot overlaps with its data, item end %lu data start %lu",
+ btrfs_item_nr_offset(slot) +
+ sizeof(struct btrfs_item),
+ btrfs_item_ptr_offset(leaf, slot));
+ return -EUCLEAN;
+ }
+
+ /* Check if the item size and content meet other criteria */
+ ret = check_leaf_item(root, leaf, &key, slot);
+ if (ret < 0)
+ return ret;
+
+ prev_key.objectid = key.objectid;
+ prev_key.type = key.type;
+ prev_key.offset = key.offset;
+ }
+
+ return 0;
+}
+
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+ unsigned long nr = btrfs_header_nritems(node);
+ struct btrfs_key key, next_key;
+ int slot;
+ u64 bytenr;
+ int ret = 0;
+
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
+ btrfs_crit(root->fs_info,
+"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
+ root->objectid, node->start,
+ nr == 0 ? "small" : "large", nr,
+ BTRFS_NODEPTRS_PER_BLOCK(root->fs_info));
+ return -EUCLEAN;
+ }
+
+ for (slot = 0; slot < nr - 1; slot++) {
+ bytenr = btrfs_node_blockptr(node, slot);
+ btrfs_node_key_to_cpu(node, &key, slot);
+ btrfs_node_key_to_cpu(node, &next_key, slot + 1);
+
+ if (!bytenr) {
+ generic_err(root, node, slot,
+ "invalid NULL node pointer");
+ ret = -EUCLEAN;
+ goto out;
+ }
+ if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) {
+ generic_err(root, node, slot,
+ "unaligned pointer, have %llu should be aligned to %u",
+ bytenr, root->fs_info->sectorsize);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+ generic_err(root, node, slot,
+ "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
+ key.objectid, key.type, key.offset,
+ next_key.objectid, next_key.type,
+ next_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000000..96c486e95d70
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+#ifndef __BTRFS_TREE_CHECKER__
+#define __BTRFS_TREE_CHECKER__
+
+#include "ctree.h"
+#include "extent_io.h"
+
+int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
+
+#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ad7f4bab640b..aa7c71cff575 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -717,7 +717,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
if (ret == 0) {
- ret = btrfs_inc_extent_ref(trans, fs_info,
+ ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
key->objectid, offset);
@@ -2699,34 +2699,36 @@ static void wait_log_commit(struct btrfs_root *root, int transid)
* so we know that if ours is more than 2 older than the
* current transaction, we're done
*/
- do {
+ for (;;) {
prepare_to_wait(&root->log_commit_wait[index],
&wait, TASK_UNINTERRUPTIBLE);
- mutex_unlock(&root->log_mutex);
- if (root->log_transid_committed < transid &&
- atomic_read(&root->log_commit[index]))
- schedule();
+ if (!(root->log_transid_committed < transid &&
+ atomic_read(&root->log_commit[index])))
+ break;
- finish_wait(&root->log_commit_wait[index], &wait);
+ mutex_unlock(&root->log_mutex);
+ schedule();
mutex_lock(&root->log_mutex);
- } while (root->log_transid_committed < transid &&
- atomic_read(&root->log_commit[index]));
+ }
+ finish_wait(&root->log_commit_wait[index], &wait);
}
static void wait_for_writer(struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (atomic_read(&root->log_writers)) {
- prepare_to_wait(&root->log_writer_wait,
- &wait, TASK_UNINTERRUPTIBLE);
+ for (;;) {
+ prepare_to_wait(&root->log_writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&root->log_writers))
+ break;
+
mutex_unlock(&root->log_mutex);
- if (atomic_read(&root->log_writers))
- schedule();
- finish_wait(&root->log_writer_wait, &wait);
+ schedule();
mutex_lock(&root->log_mutex);
}
+ finish_wait(&root->log_writer_wait, &wait);
}
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
@@ -4181,6 +4183,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
+ u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
@@ -4190,10 +4193,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
+ logged_start = start;
+ logged_end = end;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
-
/*
* Just an arbitrary number, this can be really CPU intensive
* once we start getting a lot of extents, and really once we
@@ -4208,6 +4212,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
if (em->generation <= test_gen)
continue;
+
+ if (em->start < logged_start)
+ logged_start = em->start;
+ if ((em->start + em->len - 1) > logged_end)
+ logged_end = em->start + em->len - 1;
+
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -4216,7 +4226,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
- btrfs_get_logged_extents(inode, logged_list, start, end);
+ btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
/*
* Some ordered extents started by fsync might have completed
* before we could collect them into the list logged_list, which
@@ -4637,7 +4647,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src = NULL;
LIST_HEAD(logged_list);
u64 last_extent = 0;
int err = 0;
@@ -4880,7 +4889,6 @@ again:
goto next_slot;
}
- src = path->nodes[0];
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e8f16c305df..f1ecb938ba4d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -360,7 +360,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
int again = 0;
unsigned long num_run;
unsigned long batch_run = 0;
- unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
int sync_pending = 0;
@@ -375,8 +374,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
blk_start_plug(&plug);
bdi = device->bdev->bd_bdi;
- limit = btrfs_async_submit_limit(fs_info);
- limit = limit * 2 / 3;
loop:
spin_lock(&device->io_lock);
@@ -443,13 +440,6 @@ loop_lock:
pending = pending->bi_next;
cur->bi_next = NULL;
- /*
- * atomic_dec_return implies a barrier for waitqueue_active
- */
- if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
- waitqueue_active(&fs_info->async_submit_wait))
- wake_up(&fs_info->async_submit_wait);
-
BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
/*
@@ -517,12 +507,6 @@ loop_lock:
&device->work);
goto done;
}
- /* unplug every 64 requests just for good measure */
- if (batch_run % 64 == 0) {
- blk_finish_plug(&plug);
- blk_start_plug(&plug);
- sync_pending = 0;
- }
}
cond_resched();
@@ -547,7 +531,7 @@ static void pending_bios_fn(struct btrfs_work *work)
}
-void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
{
struct btrfs_fs_devices *fs_devs;
struct btrfs_device *dev;
@@ -1068,14 +1052,15 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
-void btrfs_release_disk_super(struct page *page)
+static void btrfs_release_disk_super(struct page *page)
{
kunmap(page);
put_page(page);
}
-int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
- struct page **page, struct btrfs_super_block **disk_super)
+static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+ struct page **page,
+ struct btrfs_super_block **disk_super)
{
void *p;
pgoff_t index;
@@ -1817,8 +1802,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
return 0;
}
-struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
- struct btrfs_device *device)
+static struct btrfs_device * btrfs_find_next_active_device(
+ struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
struct btrfs_device *next_device;
@@ -2031,19 +2016,20 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
}
btrfs_close_bdev(srcdev);
-
call_rcu(&srcdev->rcu, free_device);
- /*
- * unless fs_devices is seed fs, num_devices shouldn't go
- * zero
- */
- BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
-
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
struct btrfs_fs_devices *tmp_fs_devices;
+ /*
+ * On a mounted FS, num_devices can't be zero unless it's a
+ * seed. In case of a seed device being replaced, the replace
+ * target added to the sprout FS, so there will be no more
+ * device left under the seed FS.
+ */
+ ASSERT(fs_devices->seeding);
+
tmp_fs_devices = fs_info->fs_devices;
while (tmp_fs_devices) {
if (tmp_fs_devices->seed == fs_devices) {
@@ -2323,6 +2309,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 tmp;
int seeding_dev = 0;
int ret = 0;
+ bool unlocked = false;
if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
return -EROFS;
@@ -2399,7 +2386,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
ret = btrfs_prepare_sprout(fs_info);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_trans;
+ }
}
device->fs_devices = fs_info->fs_devices;
@@ -2445,14 +2435,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error_trans;
+ goto error_sysfs;
}
}
ret = btrfs_add_device(trans, fs_info, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error_trans;
+ goto error_sysfs;
}
if (seeding_dev) {
@@ -2461,7 +2451,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
ret = btrfs_finish_sprout(trans, fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error_trans;
+ goto error_sysfs;
}
/* Sprouting would change fsid of the mounted root,
@@ -2479,6 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
+ unlocked = true;
if (ret) /* transaction commit */
return ret;
@@ -2491,7 +2482,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (IS_ERR(trans)) {
if (PTR_ERR(trans) == -ENOENT)
return 0;
- return PTR_ERR(trans);
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto error_sysfs;
}
ret = btrfs_commit_transaction(trans);
}
@@ -2500,14 +2493,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
update_dev_time(device_path);
return ret;
+error_sysfs:
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
error_trans:
- btrfs_end_transaction(trans);
+ if (seeding_dev)
+ sb->s_flags |= MS_RDONLY;
+ if (trans)
+ btrfs_end_transaction(trans);
rcu_string_free(device->name);
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev) {
+ if (seeding_dev && !unlocked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -4813,16 +4810,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em_tree = &info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- if (!ret) {
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
- refcount_inc(&em->refs);
- }
- write_unlock(&em_tree->lock);
if (ret) {
+ write_unlock(&em_tree->lock);
free_extent_map(em);
goto error;
}
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ refcount_inc(&em->refs);
+ write_unlock(&em_tree->lock);
+
ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
if (ret)
goto error_del_extent;
@@ -5695,10 +5692,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
+ if (!need_full_stripe(op))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5711,7 +5708,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
+ if (need_full_stripe(op)) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5725,7 +5722,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
num_stripes = map->sub_stripes;
else if (mirror_num)
stripe_index += mirror_num - 1;
@@ -5740,9 +5737,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map &&
- (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
- mirror_num > 1)) {
+ if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
@@ -5769,9 +5764,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != BTRFS_MAP_WRITE &&
- op != BTRFS_MAP_GET_READ_MIRRORS) &&
- mirror_num <= 1)
+ if (!need_full_stripe(op) && mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -6033,7 +6026,7 @@ static void btrfs_end_bio(struct bio *bio)
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
- bio->bi_status = 0;
+ bio->bi_status = BLK_STS_OK;
}
btrfs_end_bbio(bbio, bio);
@@ -6069,13 +6062,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
return;
}
- /*
- * nr_async_bios allows us to reliably return congestion to the
- * higher layers. Otherwise, the async bio makes it appear we have
- * made progress against dirty pages when we've really just put it
- * on a queue for later
- */
- atomic_inc(&fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
@@ -6144,7 +6130,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- bio->bi_status = BLK_STS_IOERR;
+ if (atomic_read(&bbio->error) > bbio->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
btrfs_end_bbio(bbio, bio);
}
}
@@ -6166,7 +6155,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, bio_op(bio), logical,
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
&map_length, &bbio, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
@@ -6249,7 +6238,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
if (IS_ERR(device))
- return NULL;
+ return device;
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
@@ -6377,6 +6366,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
return 0;
}
+static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid, bool error)
+{
+ if (error)
+ btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
+ else
+ btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
+}
+
static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
@@ -6447,18 +6447,21 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid);
- return -EIO;
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return -ENOENT;
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
add_missing_dev(fs_info->fs_devices, devid,
uuid);
- if (!map->stripes[i].dev) {
+ if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
- return -EIO;
+ btrfs_err(fs_info,
+ "failed to init missing dev %llu: %ld",
+ devid, PTR_ERR(map->stripes[i].dev));
+ return PTR_ERR(map->stripes[i].dev);
}
- btrfs_report_missing_device(fs_info, devid, uuid);
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
}
map->stripes[i].dev->in_fs_metadata = 1;
}
@@ -6577,19 +6580,28 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
- btrfs_report_missing_device(fs_info, devid, dev_uuid);
- return -EIO;
+ btrfs_report_missing_device(fs_info, devid,
+ dev_uuid, true);
+ return -ENOENT;
}
device = add_missing_dev(fs_devices, devid, dev_uuid);
- if (!device)
- return -ENOMEM;
- btrfs_report_missing_device(fs_info, devid, dev_uuid);
+ if (IS_ERR(device)) {
+ btrfs_err(fs_info,
+ "failed to add missing dev %llu: %ld",
+ devid, PTR_ERR(device));
+ return PTR_ERR(device);
+ }
+ btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
} else {
if (!device->bdev) {
- btrfs_report_missing_device(fs_info, devid, dev_uuid);
- if (!btrfs_test_opt(fs_info, DEGRADED))
- return -EIO;
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info,
+ devid, dev_uuid, true);
+ return -ENOENT;
+ }
+ btrfs_report_missing_device(fs_info, devid,
+ dev_uuid, false);
}
if(!device->bdev && !device->missing) {
@@ -6756,12 +6768,6 @@ out_short_read:
return -EIO;
}
-void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid)
-{
- btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid);
-}
-
/*
* Check if all chunks in the fs are OK for read-write degraded mount
*
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6108fdfec67f..ff15208344a7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -542,7 +542,5 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
-void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid);
#endif
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c248f9286366..2b52950dc2c6 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -37,6 +37,7 @@ struct workspace {
z_stream strm;
char *buf;
struct list_head list;
+ int level;
};
static void zlib_free_workspace(struct list_head *ws)
@@ -96,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
+ if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) {
pr_warn("BTRFS: deflateInit failed\n");
ret = -EIO;
goto out;
@@ -402,10 +403,22 @@ next:
return ret;
}
+static void zlib_set_level(struct list_head *ws, unsigned int type)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ unsigned level = (type & 0xF0) >> 4;
+
+ if (level > 9)
+ level = 9;
+
+ workspace->level = level > 0 ? level : 3;
+}
+
const struct btrfs_compress_op btrfs_zlib_compress = {
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
.decompress_bio = zlib_decompress_bio,
.decompress = zlib_decompress,
+ .set_level = zlib_set_level,
};
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 607ce47b483a..17f2dd8fddb8 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -423,10 +423,15 @@ finish:
return ret;
}
+static void zstd_set_level(struct list_head *ws, unsigned int type)
+{
+}
+
const struct btrfs_compress_op btrfs_zstd_compress = {
.alloc_workspace = zstd_alloc_workspace,
.free_workspace = zstd_free_workspace,
.compress_pages = zstd_compress_pages,
.decompress_bio = zstd_decompress_bio,
.decompress = zstd_decompress,
+ .set_level = zstd_set_level,
};
diff --git a/fs/buffer.c b/fs/buffer.c
index 170df856bdb9..49b7e9bdcd1d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1692,7 +1692,8 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+ create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
+ b_state);
return page_buffers(page);
}
@@ -1978,8 +1979,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
case IOMAP_MAPPED:
if (offset >= i_size_read(inode))
set_buffer_new(bh);
- bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
- ((offset - iomap->offset) >> inode->i_blkbits);
+ bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
+ inode->i_blkbits;
set_buffer_mapped(bh);
break;
}
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 32cbab0ffce3..891dedda5905 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for caching in a mounted filesystem
#
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 85a4230b9bff..174f5709e508 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for CEPH filesystem.
#
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b3e3edc09d80..4d622654bfbc 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 157fe59fbabe..ff5d32cf9578 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
@@ -1991,6 +1992,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ spin_unlock(&ci->i_ceph_lock);
dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
goto out;
}
@@ -2008,8 +2010,10 @@ retry:
mutex_lock(&session->s_mutex);
goto retry;
}
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+ if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
+ spin_unlock(&ci->i_ceph_lock);
goto out;
+ }
flushing = __mark_caps_flushing(inode, session, true,
&flush_tid, &oldest_flush_tid);
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index bdce8b1fbd06..6f67d5b884a0 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Ceph 'frag' type
*/
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index d635496ea189..644def813754 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/device.h>
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 019c2036d36f..8a5266699b67 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/spinlock.h>
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 7df550c13d7f..3c59ad180ef0 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/exportfs.h>
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 65a6fa12c857..5c17125f45c7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/module.h>
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 373dab5173ca..f2550a076edc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/module.h>
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 4c9c72f26eb9..851aa69ec8f0 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index c77028afb1e1..51f7f1d39a94 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef FS_CEPH_IOCTL_H
#define FS_CEPH_IOCTL_H
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 8cd63e8123d8..e7cce412f2cf 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/file.h>
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 84edfc60d87a..0687ab3c3267 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
@@ -734,12 +735,13 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
inode = req->r_inode;
ihold(inode);
} else {
- /* req->r_dentry is non-null for LSSNAP request.
- * fall-thru */
- WARN_ON_ONCE(!req->r_dentry);
+ /* req->r_dentry is non-null for LSSNAP request */
+ rcu_read_lock();
+ inode = get_nonsnap_parent(req->r_dentry);
+ rcu_read_unlock();
+ dout("__choose_mds using snapdir's parent %p\n", inode);
}
- }
- if (!inode && req->r_dentry) {
+ } else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
struct dentry *parent;
struct inode *dir;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 636d6b2ec49c..837ac4b087a0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_MDS_CLIENT_H
#define _FS_CEPH_MDS_CLIENT_H
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 33ced4c22732..44e53abeb32a 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/bug.h>
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 1ffc8b426c1c..8a2ca41e4b97 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/sort.h>
@@ -374,12 +375,10 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
- if (realm->cached_context) {
- ceph_put_snap_context(realm->cached_context);
- /* queue realm for cap_snap creation */
- list_add_tail(&realm->dirty_item, dirty_realms);
- }
+ ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
+ /* queue realm for cap_snap creation */
+ list_add_tail(&realm->dirty_item, dirty_realms);
return 0;
fail:
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 913dea163d5c..4a79f3632260 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Ceph fs string constants
*/
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 279a2f401cf5..3e27a28aa44a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_SUPER_H
#define _FS_CEPH_SUPER_H
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 3542b2c364cf..e1c4e0b12b4c 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/pagelist.h>
diff --git a/fs/char_dev.c b/fs/char_dev.c
index ebcc8fb3fa66..a65e4a56318c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/char_dev.c
*
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f7243617316c..d5b2e12b5d02 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -5,9 +5,14 @@ config CIFS
select CRYPTO
select CRYPTO_MD4
select CRYPTO_MD5
+ select CRYPTO_SHA256
+ select CRYPTO_CMAC
select CRYPTO_HMAC
select CRYPTO_ARC4
+ select CRYPTO_AEAD2
+ select CRYPTO_CCM
select CRYPTO_ECB
+ select CRYPTO_AES
select CRYPTO_DES
help
This is the client VFS module for the SMB3 family of NAS protocols,
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 5e853a395b92..7134f182720b 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for Linux CIFS VFS client
#
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de5b2e1fcce5..e185b2853eab 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -661,7 +661,9 @@ struct TCP_Server_Info {
#endif
unsigned int max_read;
unsigned int max_write;
- __u8 preauth_hash[512];
+#ifdef CONFIG_CIFS_SMB311
+ __u8 preauth_sha_hash[64]; /* save initital negprot hash */
+#endif /* 3.1.1 */
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
unsigned long echo_interval;
@@ -849,7 +851,9 @@ struct cifs_ses {
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
- __u8 preauth_hash[512];
+#ifdef CONFIG_CIFS_SMB311
+ __u8 preauth_sha_hash[64];
+#endif /* 3.1.1 */
};
static inline bool
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e702d48bd023..81ba6e0d88d8 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -204,7 +204,8 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon)
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
int i;
- if (unlikely(direntry->d_name.len >
+ if (unlikely(tcon->fsAttrInfo.MaxPathNameComponentLength &&
+ direntry->d_name.len >
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength)))
return -ENAMETOOLONG;
@@ -520,7 +521,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
rc = check_name(direntry, tcon);
if (rc)
- goto out_free_xid;
+ goto out;
server = tcon->ses->server;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 7ca9808a0daa..62c88dfed57b 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_DATATYPE_MISALIGNMENT, -EIO, "STATUS_DATATYPE_MISALIGNMENT"},
{STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
{STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
- {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
+ {STATUS_BUFFER_OVERFLOW, -E2BIG, "STATUS_BUFFER_OVERFLOW"},
{STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
{STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
{STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 0dafdbae1f8c..e06740436b92 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -522,6 +522,7 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
struct smb2_file_full_ea_info *smb2_data;
+ int ea_buf_size = SMB2_MIN_EA_BUF;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
@@ -541,14 +542,32 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
- smb2_data = kzalloc(SMB2_MAX_EA_BUF, GFP_KERNEL);
- if (smb2_data == NULL) {
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- return -ENOMEM;
+ while (1) {
+ smb2_data = kzalloc(ea_buf_size, GFP_KERNEL);
+ if (smb2_data == NULL) {
+ SMB2_close(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid);
+ return -ENOMEM;
+ }
+
+ rc = SMB2_query_eas(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
+ ea_buf_size, smb2_data);
+
+ if (rc != -E2BIG)
+ break;
+
+ kfree(smb2_data);
+ ea_buf_size <<= 1;
+
+ if (ea_buf_size > SMB2_MAX_EA_BUF) {
+ cifs_dbg(VFS, "EA size is too large\n");
+ SMB2_close(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid);
+ return -ENOMEM;
+ }
}
- rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, fid.volatile_fid,
- smb2_data);
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (!rc)
@@ -2068,22 +2087,6 @@ init_sg(struct smb_rqst *rqst, u8 *sign)
return sg;
}
-struct cifs_crypt_result {
- int err;
- struct completion completion;
-};
-
-static void cifs_crypt_complete(struct crypto_async_request *req, int err)
-{
- struct cifs_crypt_result *res = req->data;
-
- if (err == -EINPROGRESS)
- return;
-
- res->err = err;
- complete(&res->completion);
-}
-
static int
smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
{
@@ -2124,12 +2127,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
struct aead_request *req;
char *iv;
unsigned int iv_len;
- struct cifs_crypt_result result = {0, };
+ DECLARE_CRYPTO_WAIT(wait);
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- init_completion(&result.completion);
-
rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
if (rc) {
cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
@@ -2189,14 +2190,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
aead_request_set_ad(req, assoc_data_len);
aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- cifs_crypt_complete, &result);
+ crypto_req_done, &wait);
- rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
-
- if (rc == -EINPROGRESS || rc == -EBUSY) {
- wait_for_completion(&result.completion);
- rc = result.err;
- }
+ rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
+ : crypto_aead_decrypt(req), &wait);
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 6f0e6343c15e..5331631386a2 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -648,7 +648,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
{
int rc = 0;
struct validate_negotiate_info_req vneg_inbuf;
- struct validate_negotiate_info_rsp *pneg_rsp;
+ struct validate_negotiate_info_rsp *pneg_rsp = NULL;
u32 rsplen;
u32 inbuflen; /* max of 4 dialects */
@@ -727,8 +727,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rsplen);
/* relax check since Mac returns max bufsize allowed on ioctl */
- if (rsplen > CIFSMaxBufSize)
- return -EIO;
+ if ((rsplen > CIFSMaxBufSize)
+ || (rsplen < sizeof(struct validate_negotiate_info_rsp)))
+ goto err_rsp_free;
}
/* check validate negotiate info response matches what we got earlier */
@@ -747,10 +748,13 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
/* validate negotiate successful */
cifs_dbg(FYI, "validate negotiate info successful\n");
+ kfree(pneg_rsp);
return 0;
vneg_out:
cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+err_rsp_free:
+ kfree(pneg_rsp);
return -EIO;
}
@@ -1255,7 +1259,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
struct smb2_tree_connect_req *req;
struct smb2_tree_connect_rsp *rsp = NULL;
struct kvec iov[2];
- struct kvec rsp_iov;
+ struct kvec rsp_iov = { NULL, 0 };
int rc = 0;
int resp_buftype;
int unc_path_len;
@@ -1372,7 +1376,7 @@ tcon_exit:
return rc;
tcon_error_exit:
- if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
}
goto tcon_exit;
@@ -1975,6 +1979,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
} else
iov[0].iov_len = get_rfc1002_length(req) + 4;
+ /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
+ req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
@@ -2191,9 +2198,13 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
req->AdditionalInformation = cpu_to_le32(additional_info);
- /* 4 for rfc1002 length field and 1 for Buffer */
- req->InputBufferOffset =
- cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
+
+ /*
+ * We do not use the input buffer (do not send extra byte)
+ */
+ req->InputBufferOffset = 0;
+ inc_rfc1001_len(req, -1);
+
req->OutputBufferLength = cpu_to_le32(output_len);
iov[0].iov_base = (char *)req;
@@ -2233,12 +2244,12 @@ qinf_exit:
}
int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- struct smb2_file_full_ea_info *data)
+ u64 persistent_fid, u64 volatile_fid,
+ int ea_buf_size, struct smb2_file_full_ea_info *data)
{
return query_info(xid, tcon, persistent_fid, volatile_fid,
FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0,
- SMB2_MAX_EA_BUF,
+ ea_buf_size,
sizeof(struct smb2_file_full_ea_info),
(void **)&data,
NULL);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 6c9653a130c8..c2ec934be968 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -832,7 +832,7 @@ struct smb2_flush_rsp {
/* Channel field for read and write: exactly one of following flags can be set*/
#define SMB2_CHANNEL_NONE 0x00000000
#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */
/* SMB2 read request without RFC1001 length at the beginning */
struct smb2_read_plain_req {
@@ -1178,7 +1178,8 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
char FileName[0]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
-#define SMB2_MAX_EA_BUF 2048
+#define SMB2_MIN_EA_BUF 2048
+#define SMB2_MAX_EA_BUF 65536
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__le32 next_entry_offset;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 003217099ef3..e9ab5227e7a8 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -134,6 +134,7 @@ extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
+ int ea_buf_size,
struct smb2_file_full_ea_info *data);
extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 67367cf1f8cd..99493946e2f9 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -390,6 +390,7 @@ generate_smb30signingkey(struct cifs_ses *ses)
return generate_smb3signingkey(ses, &triplet);
}
+#ifdef CONFIG_CIFS_SMB311
int
generate_smb311signingkey(struct cifs_ses *ses)
@@ -398,25 +399,26 @@ generate_smb311signingkey(struct cifs_ses *ses)
struct derivation *d;
d = &triplet.signing;
- d->label.iov_base = "SMB2AESCMAC";
- d->label.iov_len = 12;
- d->context.iov_base = "SmbSign";
- d->context.iov_len = 8;
+ d->label.iov_base = "SMBSigningKey";
+ d->label.iov_len = 14;
+ d->context.iov_base = ses->preauth_sha_hash;
+ d->context.iov_len = 64;
d = &triplet.encryption;
- d->label.iov_base = "SMB2AESCCM";
- d->label.iov_len = 11;
- d->context.iov_base = "ServerIn ";
- d->context.iov_len = 10;
+ d->label.iov_base = "SMBC2SCipherKey";
+ d->label.iov_len = 16;
+ d->context.iov_base = ses->preauth_sha_hash;
+ d->context.iov_len = 64;
d = &triplet.decryption;
- d->label.iov_base = "SMB2AESCCM";
- d->label.iov_len = 11;
- d->context.iov_base = "ServerOut";
- d->context.iov_len = 10;
+ d->label.iov_base = "SMBS2CCipherKey";
+ d->label.iov_len = 16;
+ d->context.iov_base = ses->preauth_sha_hash;
+ d->context.iov_len = 64;
return generate_smb3signingkey(ses, &triplet);
}
+#endif /* 311 */
int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 5bb630a769e0..201fc08a8b4f 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Cache operations for Coda.
* For Linux 2.1: (C) 1997 Carnegie Mellon University
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index f13e09057c6b..845b5a66952a 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* cnode related routines for the coda kernel code
(C) 1996 Peter Braam
*/
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
index c910b5eb1ceb..c9f7a77c013e 100644
--- a/fs/coda/coda_cache.h
+++ b/fs/coda/coda_cache.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Coda filesystem -- Linux Minicache
*
* Copyright (C) 1989 - 1997 Carnegie Mellon University
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index c64075213218..d702ba1a2bf9 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* coda_fs_i.h
*
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 381c993b1427..bb0b3e0ed6c2 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CODA_INT_
#define _CODA_INT_
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index f1714cfb589c..ca599df0dcb1 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Inode operations for Coda filesystem
* Original version: (C) 1996 P. Braam and M. Callahan
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index d3c361883c28..126155cadfa9 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Coda File System, Linux Kernel module
*
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 274ab5586dd0..00876ddadb43 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Directory operations for Coda filesystem
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 363402fcb3ed..1cbc1f2298ee 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* File operations for Coda.
* Original version: (C) 1996 Peter Braam
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6058df380cc0..6f0a6a4d5faa 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Super block/filesystem wide operations
*
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index b0b9cda41928..e0c17b7dccce 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pioctl operations for Coda.
* Original version: (C) 1996 Peter Braam
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 03736e20d720..202297d156df 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Symlink inode operations for Coda filesystem
* Original version: (C) 1996 P. Braam and M. Callahan
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 34218a8a28cd..0301d45000a8 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Sysctl operations for Coda filesystem
* Original version: (C) 1996 P. Braam and M. Callahan
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index e82357c89979..a37f003530d7 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Mostly platform independent upcall operations to Venus:
* -- upcalls
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d27b326d96f4..bd5d91e119ca 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ioctl32.c: Conversion between 32bit and 64bit native ioctls.
*
diff --git a/fs/coredump.c b/fs/coredump.c
index 0eec03696707..52c63d6c9143 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index ec4f1d4fdad0..975d98fc26b5 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* uncompress.c
*
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 9f6607f17b53..cb496989a6b6 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
-fscrypto-y := crypto.o fname.o policy.o keyinfo.o
+fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o
fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 483784d5eb73..0d5e6a569d58 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This contains encryption functions for per-file encryption.
*
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index c7835df7e7b8..732a786cce9d 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -126,21 +126,6 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
}
EXPORT_SYMBOL(fscrypt_get_ctx);
-/**
- * page_crypt_complete() - completion callback for page crypto
- * @req: The asynchronous cipher request context
- * @res: The result of the cipher operation
- */
-static void page_crypt_complete(struct crypto_async_request *req, int res)
-{
- struct fscrypt_completion_result *ecr = req->data;
-
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
-
int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
u64 lblk_num, struct page *src_page,
struct page *dest_page, unsigned int len,
@@ -151,7 +136,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
u8 padding[FS_IV_SIZE - sizeof(__le64)];
} iv;
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
@@ -179,7 +164,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- page_crypt_complete, &ecr);
+ crypto_req_done, &wait);
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, len, offs);
@@ -187,14 +172,9 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
sg_set_page(&src, src_page, len, offs);
skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
- res = crypto_skcipher_decrypt(req);
+ res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
else
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
printk_ratelimited(KERN_ERR
@@ -340,7 +320,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
dir = dget_parent(dentry);
- if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) {
+ if (!IS_ENCRYPTED(d_inode(dir))) {
dput(dir);
return 0;
}
@@ -410,11 +390,8 @@ int fscrypt_initialize(unsigned int cop_flags)
{
int i, res = -ENOMEM;
- /*
- * No need to allocate a bounce page pool if there already is one or
- * this FS won't use it.
- */
- if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool)
+ /* No need to allocate a bounce page pool if this FS won't use it. */
+ if (cop_flags & FS_CFLG_OWN_PAGES)
return 0;
mutex_lock(&fscrypt_init_mutex);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index ad9f814fdead..305541bcd108 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This contains functions for filename crypto management
*
@@ -15,21 +16,6 @@
#include "fscrypt_private.h"
/**
- * fname_crypt_complete() - completion callback for filename crypto
- * @req: The asynchronous cipher request context
- * @res: The result of the cipher operation
- */
-static void fname_crypt_complete(struct crypto_async_request *req, int res)
-{
- struct fscrypt_completion_result *ecr = req->data;
-
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
-
-/**
* fname_encrypt() - encrypt a filename
*
* The caller must have allocated sufficient memory for the @oname string.
@@ -40,7 +26,7 @@ static int fname_encrypt(struct inode *inode,
const struct qstr *iname, struct fscrypt_str *oname)
{
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
@@ -76,17 +62,12 @@ static int fname_encrypt(struct inode *inode,
}
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- fname_crypt_complete, &ecr);
+ crypto_req_done, &wait);
sg_init_one(&sg, oname->name, cryptlen);
skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv);
/* Do the encryption */
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- /* Request is being completed asynchronously; wait for it */
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(KERN_ERR
@@ -110,7 +91,7 @@ static int fname_decrypt(struct inode *inode,
struct fscrypt_str *oname)
{
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
@@ -131,7 +112,7 @@ static int fname_decrypt(struct inode *inode,
}
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- fname_crypt_complete, &ecr);
+ crypto_req_done, &wait);
/* Initialize IV */
memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
@@ -140,11 +121,7 @@ static int fname_decrypt(struct inode *inode,
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_skcipher_decrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(KERN_ERR
@@ -382,8 +359,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
- if (!dir->i_sb->s_cop->is_encrypted(dir) ||
- fscrypt_is_dot_dotdot(iname)) {
+ if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) {
fname->disk_name.name = (unsigned char *)iname->name;
fname->disk_name.len = iname->len;
return 0;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index a1d5021c31ef..c0b4f5597e1a 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fscrypt_private.h
*
@@ -11,7 +12,8 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
-#include <linux/fscrypt_supp.h>
+#define __FS_HAS_ENCRYPTION 1
+#include <linux/fscrypt.h>
#include <crypto/hash.h>
/* Encryption parameters */
@@ -69,16 +71,6 @@ typedef enum {
#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002
-struct fscrypt_completion_result {
- struct completion completion;
- int res;
-};
-
-#define DECLARE_FS_COMPLETION_RESULT(ecr) \
- struct fscrypt_completion_result ecr = { \
- COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 }
-
-
/* crypto.c */
extern int fscrypt_initialize(unsigned int cop_flags);
extern struct workqueue_struct *fscrypt_read_workqueue;
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
new file mode 100644
index 000000000000..9f5fb2eb9cf7
--- /dev/null
+++ b/fs/crypto/hooks.c
@@ -0,0 +1,112 @@
+/*
+ * fs/crypto/hooks.c
+ *
+ * Encryption hooks for higher-level filesystem operations.
+ */
+
+#include <linux/ratelimit.h>
+#include "fscrypt_private.h"
+
+/**
+ * fscrypt_file_open - prepare to open a possibly-encrypted regular file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * Currently, an encrypted regular file can only be opened if its encryption key
+ * is available; access to the raw encrypted contents is not supported.
+ * Therefore, we first set up the inode's encryption key (if not already done)
+ * and return an error if it's unavailable.
+ *
+ * We also verify that if the parent directory (from the path via which the file
+ * is being opened) is encrypted, then the inode being opened uses the same
+ * encryption policy. This is needed as part of the enforcement that all files
+ * in an encrypted directory tree use the same encryption policy, as a
+ * protection against certain types of offline attacks. Note that this check is
+ * needed even when opening an *unencrypted* file, since it's forbidden to have
+ * an unencrypted file in an encrypted directory.
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ */
+int fscrypt_file_open(struct inode *inode, struct file *filp)
+{
+ int err;
+ struct dentry *dir;
+
+ err = fscrypt_require_key(inode);
+ if (err)
+ return err;
+
+ dir = dget_parent(file_dentry(filp));
+ if (IS_ENCRYPTED(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu",
+ d_inode(dir)->i_ino, inode->i_ino);
+ err = -EPERM;
+ }
+ dput(dir);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_file_open);
+
+int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
+{
+ int err;
+
+ err = fscrypt_require_key(dir);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_permitted_context(dir, inode))
+ return -EPERM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_link);
+
+int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ int err;
+
+ err = fscrypt_require_key(old_dir);
+ if (err)
+ return err;
+
+ err = fscrypt_require_key(new_dir);
+ if (err)
+ return err;
+
+ if (old_dir != new_dir) {
+ if (IS_ENCRYPTED(new_dir) &&
+ !fscrypt_has_permitted_context(new_dir,
+ d_inode(old_dentry)))
+ return -EPERM;
+
+ if ((flags & RENAME_EXCHANGE) &&
+ IS_ENCRYPTED(old_dir) &&
+ !fscrypt_has_permitted_context(old_dir,
+ d_inode(new_dentry)))
+ return -EPERM;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename);
+
+int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry)
+{
+ int err = fscrypt_get_encryption_info(dir);
+
+ if (err)
+ return err;
+
+ if (fscrypt_has_encryption_key(dir)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ d_set_d_op(dentry, &fscrypt_d_ops);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 018c588c7ac3..5e6e846f5a24 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* key management facility for FS encryption support.
*
@@ -17,17 +18,6 @@
static struct crypto_shash *essiv_hash_tfm;
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
- struct fscrypt_completion_result *ecr = req->data;
-
- if (rc == -EINPROGRESS)
- return;
-
- ecr->res = rc;
- complete(&ecr->completion);
-}
-
/**
* derive_key_aes() - Derive a key using AES-128-ECB
* @deriving_key: Encryption key used for derivation.
@@ -42,7 +32,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
{
int res = 0;
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
@@ -59,7 +49,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
}
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- derive_crypt_complete, &ecr);
+ crypto_req_done, &wait);
res = crypto_skcipher_setkey(tfm, deriving_key,
FS_AES_128_ECB_KEY_SIZE);
if (res < 0)
@@ -69,11 +59,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
sg_init_one(&dst_sg, derived_raw_key, source_key->size);
skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size,
NULL);
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
skcipher_request_free(req);
crypto_free_skcipher(tfm);
@@ -109,6 +95,11 @@ static int validate_user_key(struct fscrypt_info *crypt_info,
goto out;
}
ukp = user_key_payload_locked(keyring_key);
+ if (!ukp) {
+ /* key was revoked before we acquired its semaphore */
+ res = -EKEYREVOKED;
+ goto out;
+ }
if (ukp->datalen != sizeof(struct fscrypt_key)) {
res = -EINVAL;
goto out;
@@ -268,7 +259,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (res < 0) {
if (!fscrypt_dummy_context_enabled(inode) ||
- inode->i_sb->s_cop->is_encrypted(inode))
+ IS_ENCRYPTED(inode))
return res;
/* Fake up a context for an unencrypted directory */
memset(&ctx, 0, sizeof(ctx));
@@ -368,7 +359,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
struct fscrypt_info *prev;
if (ci == NULL)
- ci = ACCESS_ONCE(inode->i_crypt_info);
+ ci = READ_ONCE(inode->i_crypt_info);
if (ci == NULL)
return;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index ce07a86200f3..c6d431a5cce9 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Encryption policy functions for per-file encryption support.
*
@@ -109,7 +110,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
struct fscrypt_policy policy;
int res;
- if (!inode->i_sb->s_cop->is_encrypted(inode))
+ if (!IS_ENCRYPTED(inode))
return -ENODATA;
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
@@ -166,11 +167,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
return 1;
/* No restrictions if the parent directory is unencrypted */
- if (!cops->is_encrypted(parent))
+ if (!IS_ENCRYPTED(parent))
return 1;
/* Encrypted directories must not contain unencrypted files */
- if (!cops->is_encrypted(child))
+ if (!IS_ENCRYPTED(child))
return 0;
/*
diff --git a/fs/dax.c b/fs/dax.c
index f001d8c72a06..f3a44a7c14b3 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -938,7 +938,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
- return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
+ return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}
static loff_t
diff --git a/fs/dcache.c b/fs/dcache.c
index f90141387f01..bcc9f6981569 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
{
/*
* Be careful about RCU walk racing with rename:
- * use 'lockless_dereference' to fetch the name pointer.
+ * use 'READ_ONCE' to fetch the name pointer.
*
* NOTE! Even if a rename will mean that the length
* was not loaded atomically, we don't care. The
@@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
* early because the data cannot match (there can
* be no NUL in the ct/tcount data)
*/
- const unsigned char *cs = lockless_dereference(dentry->d_name.name);
+ const unsigned char *cs = READ_ONCE(dentry->d_name.name);
return dentry_string_cmp(cs, ct, tcount);
}
@@ -630,7 +630,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
rcu_read_lock();
spin_unlock(&dentry->d_lock);
again:
- parent = ACCESS_ONCE(dentry->d_parent);
+ parent = READ_ONCE(dentry->d_parent);
spin_lock(&parent->d_lock);
/*
* We can't blindly lock dentry until we are sure
@@ -721,7 +721,7 @@ static inline bool fast_dput(struct dentry *dentry)
* around with a zero refcount.
*/
smp_rmb();
- d_flags = ACCESS_ONCE(dentry->d_flags);
+ d_flags = READ_ONCE(dentry->d_flags);
d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
/* Nothing to do? Dropping the reference was all we needed? */
@@ -850,11 +850,11 @@ struct dentry *dget_parent(struct dentry *dentry)
* locking.
*/
rcu_read_lock();
- ret = ACCESS_ONCE(dentry->d_parent);
+ ret = READ_ONCE(dentry->d_parent);
gotref = lockref_get_not_zero(&ret->d_lockref);
rcu_read_unlock();
if (likely(gotref)) {
- if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+ if (likely(ret == READ_ONCE(dentry->d_parent)))
return ret;
dput(ret);
}
@@ -3040,7 +3040,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
* @buflen: allocated length of the buffer
* @name: name string and length qstr structure
*
- * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
+ * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
* make sure that either the old or the new name pointer and length are
* fetched. However, there may be mismatch between length and pointer.
* The length cannot be trusted, we need to copy it byte-by-byte until
@@ -3054,8 +3054,8 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
*/
static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
{
- const char *dname = ACCESS_ONCE(name->name);
- u32 dlen = ACCESS_ONCE(name->len);
+ const char *dname = READ_ONCE(name->name);
+ u32 dlen = READ_ONCE(name->len);
char *p;
smp_read_barrier_depends();
@@ -3120,7 +3120,7 @@ restart:
struct dentry * parent;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
- struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
+ struct mount *parent = READ_ONCE(mnt->mnt_parent);
/* Escaped? */
if (dentry != vfsmnt->mnt_root) {
bptr = *buffer;
@@ -3130,7 +3130,7 @@ restart:
}
/* Global root? */
if (mnt != parent) {
- dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+ dentry = READ_ONCE(mnt->mnt_mountpoint);
mnt = parent;
vfsmnt = &mnt->mnt;
continue;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5fa2211e49ae..98fe1325da9d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -45,6 +45,12 @@
#define DIO_PAGES 64
/*
+ * Flags for dio_complete()
+ */
+#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */
+#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */
+
+/*
* This code generally works in units of "dio_blocks". A dio_block is
* somewhere between the hard sector size and the filesystem block size. it
* is determined on a per-invocation basis. When talking to the filesystem
@@ -225,10 +231,11 @@ static inline struct page *dio_get_page(struct dio *dio,
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
{
loff_t offset = dio->iocb->ki_pos;
ssize_t transferred = 0;
+ int err;
/*
* AIO submission can race with bio completion to get here while
@@ -259,18 +266,37 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
ret = transferred;
if (dio->end_io) {
- int err;
-
// XXX: ki_pos??
err = dio->end_io(dio->iocb, offset, ret, dio->private);
if (err)
ret = err;
}
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ *
+ * And this page cache invalidation has to be after dio->end_io(), as
+ * some filesystems convert unwritten extents to real allocations in
+ * end_io() when necessary, otherwise a racing buffer read would cache
+ * zeros from unwritten extents.
+ */
+ if (flags & DIO_COMPLETE_INVALIDATE &&
+ ret > 0 && dio->op == REQ_OP_WRITE &&
+ dio->inode->i_mapping->nrpages) {
+ err = invalidate_inode_pages2_range(dio->inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ (offset + ret - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(err);
+ }
+
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(dio->inode);
- if (is_async) {
+ if (flags & DIO_COMPLETE_ASYNC) {
/*
* generic_write_sync expects ki_pos to have been updated
* already, but the submission path only does this for
@@ -291,7 +317,7 @@ static void dio_aio_complete_work(struct work_struct *work)
{
struct dio *dio = container_of(work, struct dio, complete_work);
- dio_complete(dio, 0, true);
+ dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
}
static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -304,6 +330,7 @@ static void dio_bio_end_aio(struct bio *bio)
struct dio *dio = bio->bi_private;
unsigned long remaining;
unsigned long flags;
+ bool defer_completion = false;
/* cleanup the bio */
dio_bio_complete(dio, bio);
@@ -315,12 +342,24 @@ static void dio_bio_end_aio(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- if (dio->result && dio->defer_completion) {
+ /*
+ * Defer completion when defer_completion is set or
+ * when the inode has pages mapped and this is AIO write.
+ * We need to invalidate those pages because there is a
+ * chance they contain stale data in the case buffered IO
+ * went in between AIO submission and completion into the
+ * same region.
+ */
+ if (dio->result)
+ defer_completion = dio->defer_completion ||
+ (dio->op == REQ_OP_WRITE &&
+ dio->inode->i_mapping->nrpages);
+ if (defer_completion) {
INIT_WORK(&dio->complete_work, dio_aio_complete_work);
queue_work(dio->inode->i_sb->s_dio_done_wq,
&dio->complete_work);
} else {
- dio_complete(dio, 0, true);
+ dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
}
}
}
@@ -838,7 +877,8 @@ out:
*/
if (sdio->boundary) {
ret = dio_send_cur_page(dio, sdio, map_bh);
- dio_bio_submit(dio, sdio);
+ if (sdio->bio)
+ dio_bio_submit(dio, sdio);
put_page(sdio->cur_page);
sdio->cur_page = NULL;
}
@@ -1112,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
- unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+ unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
@@ -1210,10 +1250,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
* so that we can call ->fsync.
*/
- if (dio->is_async && iov_iter_rw(iter) == WRITE &&
- ((iocb->ki_filp->f_flags & O_DSYNC) ||
- IS_SYNC(iocb->ki_filp->f_mapping->host))) {
- retval = dio_set_defer_completion(dio);
+ if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+ retval = 0;
+ if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+ IS_SYNC(iocb->ki_filp->f_mapping->host))
+ retval = dio_set_defer_completion(dio);
+ else if (!dio->inode->i_sb->s_dio_done_wq) {
+ /*
+ * In case of AIO write racing with buffered read we
+ * need to defer completion. We can't decide this now,
+ * however the workqueue needs to be initialized here.
+ */
+ retval = sb_init_dio_done_wq(dio->inode->i_sb);
+ }
if (retval) {
/*
* We grab i_mutex only for reads so we don't have
@@ -1322,7 +1371,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
- retval = dio_complete(dio, retval, false);
+ retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index ca1c9124c8ce..3545fdafc6fb 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DLM) += dlm.o
dlm-y := ast.o \
config.o \
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b90433..82377017130f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implement the manual drop-all-pagecache function
*/
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 9c351bf757b2..3fbc0ff79699 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -84,11 +84,16 @@ struct ecryptfs_page_crypt_context {
static inline struct ecryptfs_auth_tok *
ecryptfs_get_encrypted_key_payload_data(struct key *key)
{
- if (key->type == &key_type_encrypted)
- return (struct ecryptfs_auth_tok *)
- (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data);
- else
+ struct encrypted_key_payload *payload;
+
+ if (key->type != &key_type_encrypted)
return NULL;
+
+ payload = key->payload.data[0];
+ if (!payload)
+ return ERR_PTR(-EKEYREVOKED);
+
+ return (struct ecryptfs_auth_tok *)payload->payload_data;
}
static inline struct key *ecryptfs_get_encrypted_key(char *sig)
@@ -114,12 +119,17 @@ static inline struct ecryptfs_auth_tok *
ecryptfs_get_key_payload_data(struct key *key)
{
struct ecryptfs_auth_tok *auth_tok;
+ struct user_key_payload *ukp;
auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
- if (!auth_tok)
- return (struct ecryptfs_auth_tok *)user_key_payload_locked(key)->data;
- else
+ if (auth_tok)
return auth_tok;
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp)
+ return ERR_PTR(-EKEYREVOKED);
+
+ return (struct ecryptfs_auth_tok *)ukp->data;
}
#define ECRYPTFS_MAX_KEYSET_SIZE 1024
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 3cf1546dca82..fa218cd64f74 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -459,7 +459,8 @@ out:
* @auth_tok_key: key containing the authentication token
* @auth_tok: authentication token
*
- * Returns zero on valid auth tok; -EINVAL otherwise
+ * Returns zero on valid auth tok; -EINVAL if the payload is invalid; or
+ * -EKEYREVOKED if the key was revoked before we acquired its semaphore.
*/
static int
ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key,
@@ -468,6 +469,12 @@ ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key,
int rc = 0;
(*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key);
+ if (IS_ERR(*auth_tok)) {
+ rc = PTR_ERR(*auth_tok);
+ *auth_tok = NULL;
+ goto out;
+ }
+
if (ecryptfs_verify_version((*auth_tok)->version)) {
printk(KERN_ERR "Data structure version mismatch. Userspace "
"tools must match eCryptfs kernel module with major "
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index a7be96e5f1cb..f892ac7c2a35 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* dir.c
*
diff --git a/fs/efs/efs.h b/fs/efs/efs.h
index 70f5d4f9a945..13a4d9622633 100644
--- a/fs/efs/efs.h
+++ b/fs/efs/efs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 1999 Al Smith
*
diff --git a/fs/efs/file.c b/fs/efs/file.c
index a37dcee46866..9e641da6fab2 100644
--- a/fs/efs/file.c
+++ b/fs/efs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* file.c
*
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index d34a40edcdb2..38961ee1d1af 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* namei.c
*
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 5c42f1e34a2f..65b59009555b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* super.c
*
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 4870cc82deb0..923eb91654d5 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* symlink.c
*
diff --git a/fs/exec.c b/fs/exec.c
index ac34d9724684..1d6243d9f2b6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}
-int bprm_change_interp(char *interp, struct linux_binprm *bprm)
+int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
/* If a binfmt changed the interp, free it first. */
if (bprm->interp != bprm->filename)
@@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
+ membarrier_execve(current);
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
@@ -1910,7 +1911,7 @@ void set_dumpable(struct mm_struct *mm, int value)
return;
do {
- old = ACCESS_ONCE(mm->flags);
+ old = READ_ONCE(mm->flags);
new = (old & ~MMF_DUMPABLE_MASK) | value;
} while (cmpxchg(&mm->flags, old, new) != old);
}
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index 445b0e996a12..311479d864a7 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux ext2-filesystem routines.
#
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 51f0aea70cb4..224c04abb2e5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/acl.c
*
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 44937f9fcf32..0f01c759daac 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
File: fs/ext2/acl.h
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index d0bdb74f0e15..e1b3724bebf2 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/balloc.c
*
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index e2709695b177..987647986f47 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/dir.c
*
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 28de3edd4f4d..032295e1d386 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index ff3a3636a5ca..c67b486488fd 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/file.c
*
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 395fc074c0db..a1fc3dabca41 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/ialloc.c
*
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 4dca6f348714..9b2ac55ac34f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/inode.c
*
@@ -820,11 +821,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret == 0) {
iomap->type = IOMAP_HOLE;
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->length = 1 << blkbits;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->blkno = (sector_t)bno << (blkbits - 9);
+ iomap->addr = (u64)bno << blkbits;
iomap->length = (u64)ret << blkbits;
iomap->flags |= IOMAP_F_MERGED;
}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 087f122cca42..0367c0039e68 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/ioctl.c
*
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 814e405a2da6..e078075dc66f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/namei.c
*
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index eeffb0138a17..d5589ddcc281 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/symlink.c
*
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 1b9b1268d418..62d9a659a8ff 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/xattr.c
*
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 6f82ab1b00ca..cee888cdc235 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
File: linux/ext2_xattr.h
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 7b9e9c1842d5..9a682e440acb 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/xattr_security.c
* Handler for storing security labels as extended attributes.
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 65049b71af13..49add1107850 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/xattr_trusted.c
* Handler for trusted extended attributes.
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index fb2f992ae763..c243a3b4d69d 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext2/xattr_user.c
* Handler for extended user attributes.
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index e38039fd96ff..73b850f5659c 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,6 +37,7 @@ config EXT4_FS
select CRC16
select CRYPTO
select CRYPTO_CRC32C
+ select FS_IOMAP
help
This is the next generation of the ext3 filesystem.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index d9beca1653c5..8fdfcd3c3e04 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux ext4-filesystem routines.
#
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 46ff2229ff5e..fb50f9aa6ead 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/acl.c
*
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index da2c79577d72..a48fc5ae2701 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
File: fs/ext4/acl.h
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e04ec868e37e..a943e568292e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/balloc.c
*
@@ -600,22 +601,21 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
* for the current or committing transaction to complete, and then
- * return TRUE.
- *
- * if the total number of retries exceed three times, return FALSE.
+ * return TRUE. We will only retry once.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
- (*retries)++ > 3 ||
+ (*retries)++ > 1 ||
!EXT4_SB(sb)->s_journal)
return 0;
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
-
smp_mb();
- if (EXT4_SB(sb)->s_mb_free_pending)
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ if (EXT4_SB(sb)->s_mb_free_pending == 0)
+ return 0;
+
+ jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+ jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
return 1;
}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 4a606afb171f..f63e028c638c 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/bitmap.c
*
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fdb19543af1e..bee888e0e2db 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/block_validity.c
*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b04e882179c6..d5babc9f222b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/dir.c
*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e2abe01c8c6b..4e091eae38b1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ext4.h
*
@@ -33,17 +34,15 @@
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
/*
* The fourth extended filesystem constants/structures
*/
@@ -545,8 +544,8 @@ struct ext4_new_group_data {
__u64 inode_table;
__u32 blocks_count;
__u16 reserved_blocks;
- __u16 unused;
- __u32 free_blocks_count;
+ __u16 mdata_blocks;
+ __u32 free_clusters_count;
};
/* Indexes used to index group tables in ext4_new_group_data */
@@ -644,43 +643,6 @@ enum {
#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
-#ifndef FS_IOC_FSGETXATTR
-/* Until the uapi changes get merged for project quota... */
-
-#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr)
-#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr)
-
-/*
- * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR.
- */
-struct fsxattr {
- __u32 fsx_xflags; /* xflags field value (get/set) */
- __u32 fsx_extsize; /* extsize field value (get/set)*/
- __u32 fsx_nextents; /* nextents field value (get) */
- __u32 fsx_projid; /* project identifier (get/set) */
- unsigned char fsx_pad[12];
-};
-
-/*
- * Flags for the fsx_xflags field
- */
-#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
-#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
-#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
-#define FS_XFLAG_APPEND 0x00000010 /* all writes append */
-#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
-#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */
-#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
-#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
-#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
-#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
-#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
-#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
-#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
-#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
-#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
-#endif /* !defined(FS_IOC_FSGETXATTR) */
-
#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
@@ -1392,8 +1354,6 @@ struct ext4_sb_info {
int s_first_ino;
unsigned int s_inode_readahead_blks;
unsigned int s_inode_goal;
- spinlock_t s_next_gen_lock;
- u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
@@ -2515,9 +2475,6 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
-extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
- unsigned int map_len,
- struct extent_status *result);
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3048,6 +3005,10 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
extern int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
int *has_inline, __u64 start, __u64 len);
+
+struct iomap;
+extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
+
extern int ext4_try_to_evict_inline_data(handle_t *handle,
struct inode *inode,
int needed);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 5b342ac67d2e..2d593201cf7a 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Interface between ext4 and JBD
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 97f0fd06728d..07bca11749d4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4794,7 +4794,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
+ (offset + len > i_size_read(inode) ||
+ offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
if (ret)
@@ -4965,7 +4966,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
+ (offset + len > i_size_read(inode) ||
+ offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
if (ret)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e7f12a204cbc..763ef185dd17 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/ext4/extents_status.c
*
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f7aa24f4642d..ca90fc96f47e 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/ext4/extents_status.h
*
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1da660ac3bc..ad204d2724ac 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/file.c
*
@@ -20,6 +21,7 @@
#include <linux/time.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
@@ -364,7 +366,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct vfsmount *mnt = filp->f_path.mnt;
- struct dentry *dir;
struct path path;
char buf[64], *cp;
int ret;
@@ -404,25 +405,11 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
ext4_journal_stop(handle);
}
}
- if (ext4_encrypted_inode(inode)) {
- ret = fscrypt_get_encryption_info(inode);
- if (ret)
- return -EACCES;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
- dir = dget_parent(file_dentry(filp));
- if (ext4_encrypted_inode(d_inode(dir)) &&
- !fscrypt_has_permitted_context(d_inode(dir), inode)) {
- ext4_warning(inode->i_sb,
- "Inconsistent encryption contexts: %lu/%lu",
- (unsigned long) d_inode(dir)->i_ino,
- (unsigned long) inode->i_ino);
- dput(dir);
- return -EPERM;
- }
- dput(dir);
+ ret = fscrypt_file_open(inode, filp);
+ if (ret)
+ return ret;
+
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
@@ -438,248 +425,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
}
/*
- * Here we use ext4_map_blocks() to get a block mapping for a extent-based
- * file rather than ext4_ext_walk_space() because we can introduce
- * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
- * function. When extent status tree has been fully implemented, it will
- * track all extent status for a file and we can directly use it to
- * retrieve the offset for SEEK_DATA/SEEK_HOLE.
- */
-
-/*
- * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
- * lookup page cache to check whether or not there has some data between
- * [startoff, endoff] because, if this range contains an unwritten extent,
- * we determine this extent as a data or a hole according to whether the
- * page cache has data or not.
- */
-static int ext4_find_unwritten_pgoff(struct inode *inode,
- int whence,
- ext4_lblk_t end_blk,
- loff_t *offset)
-{
- struct pagevec pvec;
- unsigned int blkbits;
- pgoff_t index;
- pgoff_t end;
- loff_t endoff;
- loff_t startoff;
- loff_t lastoff;
- int found = 0;
-
- blkbits = inode->i_sb->s_blocksize_bits;
- startoff = *offset;
- lastoff = startoff;
- endoff = (loff_t)end_blk << blkbits;
-
- index = startoff >> PAGE_SHIFT;
- end = (endoff - 1) >> PAGE_SHIFT;
-
- pagevec_init(&pvec, 0);
- do {
- int i;
- unsigned long nr_pages;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
- &index, end);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- /*
- * If current offset is smaller than the page offset,
- * there is a hole at this offset.
- */
- if (whence == SEEK_HOLE && lastoff < endoff &&
- lastoff < page_offset(pvec.pages[i])) {
- found = 1;
- *offset = lastoff;
- goto out;
- }
-
- lock_page(page);
-
- if (unlikely(page->mapping != inode->i_mapping)) {
- unlock_page(page);
- continue;
- }
-
- if (!page_has_buffers(page)) {
- unlock_page(page);
- continue;
- }
-
- if (page_has_buffers(page)) {
- lastoff = page_offset(page);
- bh = head = page_buffers(page);
- do {
- if (lastoff + bh->b_size <= startoff)
- goto next;
- if (buffer_uptodate(bh) ||
- buffer_unwritten(bh)) {
- if (whence == SEEK_DATA)
- found = 1;
- } else {
- if (whence == SEEK_HOLE)
- found = 1;
- }
- if (found) {
- *offset = max_t(loff_t,
- startoff, lastoff);
- unlock_page(page);
- goto out;
- }
-next:
- lastoff += bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- lastoff = page_offset(page) + PAGE_SIZE;
- unlock_page(page);
- }
-
- pagevec_release(&pvec);
- } while (index <= end);
-
- /* There are no pages upto endoff - that would be a hole in there. */
- if (whence == SEEK_HOLE && lastoff < endoff) {
- found = 1;
- *offset = lastoff;
- }
-out:
- pagevec_release(&pvec);
- return found;
-}
-
-/*
- * ext4_seek_data() retrieves the offset for SEEK_DATA.
- */
-static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
-{
- struct inode *inode = file->f_mapping->host;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t dataoff, isize;
- int blkbits;
- int ret;
-
- inode_lock(inode);
-
- isize = i_size_read(inode);
- if (offset < 0 || offset >= isize) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- dataoff = offset;
-
- do {
- ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
- if (ret <= 0) {
- /* No extent found -> no data */
- if (ret == 0)
- ret = -ENXIO;
- inode_unlock(inode);
- return ret;
- }
-
- last = es.es_lblk;
- if (last != start)
- dataoff = (loff_t)last << blkbits;
- if (!ext4_es_is_unwritten(&es))
- break;
-
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- es.es_lblk + es.es_len, &dataoff))
- break;
- last += es.es_len;
- dataoff = (loff_t)last << blkbits;
- cond_resched();
- } while (last <= end);
-
- inode_unlock(inode);
-
- if (dataoff > isize)
- return -ENXIO;
-
- return vfs_setpos(file, dataoff, maxsize);
-}
-
-/*
- * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
- */
-static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
-{
- struct inode *inode = file->f_mapping->host;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t holeoff, isize;
- int blkbits;
- int ret;
-
- inode_lock(inode);
-
- isize = i_size_read(inode);
- if (offset < 0 || offset >= isize) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- holeoff = offset;
-
- do {
- ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
- if (ret < 0) {
- inode_unlock(inode);
- return ret;
- }
- /* Found a hole? */
- if (ret == 0 || es.es_lblk > last) {
- if (last != start)
- holeoff = (loff_t)last << blkbits;
- break;
- }
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (ext4_es_is_unwritten(&es) &&
- ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- last + es.es_len, &holeoff))
- break;
-
- last += es.es_len;
- holeoff = (loff_t)last << blkbits;
- cond_resched();
- } while (last <= end);
-
- inode_unlock(inode);
-
- if (holeoff > isize)
- holeoff = isize;
-
- return vfs_setpos(file, holeoff, maxsize);
-}
-
-/*
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
* by calling generic_file_llseek_size() with the appropriate maxbytes
* value for each.
@@ -695,18 +440,24 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
maxbytes = inode->i_sb->s_maxbytes;
switch (whence) {
- case SEEK_SET:
- case SEEK_CUR:
- case SEEK_END:
+ default:
return generic_file_llseek_size(file, offset, whence,
maxbytes, i_size_read(inode));
- case SEEK_DATA:
- return ext4_seek_data(file, offset, maxbytes);
case SEEK_HOLE:
- return ext4_seek_hole(file, offset, maxbytes);
+ inode_lock_shared(inode);
+ offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+ inode_unlock_shared(inode);
+ break;
+ case SEEK_DATA:
+ inode_lock_shared(inode);
+ offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+ inode_unlock_shared(inode);
+ break;
}
- return -EINVAL;
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, maxbytes);
}
const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index f9230580a84b..26a7fe5c4fd3 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/fsync.c
*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ee823022aa34..b4267d72f249 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/ialloc.c
*
@@ -1138,9 +1139,7 @@ got:
inode->i_ino);
goto out;
}
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
+ inode->i_generation = prandom_u32();
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7ffa290cbb8e..c32802c956d5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/indirect.c
*
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 28c5c3abddb3..1367553c43bb 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -12,6 +12,7 @@
* GNU General Public License for more details.
*/
+#include <linux/iomap.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
@@ -302,11 +303,6 @@ static int ext4_create_inline_data(handle_t *handle,
EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
- /*
- * Propagate changes to inode->i_flags as well - e.g. S_DAX may
- * get cleared
- */
- ext4_set_inode_flags(inode);
get_bh(is.iloc.bh);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -451,11 +447,6 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
}
}
ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
- /*
- * Propagate changes to inode->i_flags as well - e.g. S_DAX may
- * get set.
- */
- ext4_set_inode_flags(inode);
get_bh(is.iloc.bh);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -1827,6 +1818,38 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
return ret;
}
+int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
+{
+ __u64 addr;
+ int error = -EAGAIN;
+ struct ext4_iloc iloc;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode))
+ goto out;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ goto out;
+
+ addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+ addr += offsetof(struct ext4_inode, i_block);
+
+ brelse(iloc.bh);
+
+ iomap->addr = addr;
+ iomap->offset = 0;
+ iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
+ i_size_read(inode));
+ iomap->type = 0;
+ iomap->flags = IOMAP_F_DATA_INLINE;
+
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return error;
+}
+
int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
int *has_inline, __u64 start, __u64 len)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 31db875bc7a1..2633150e41b9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/inode.c
*
@@ -3393,7 +3394,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-#ifdef CONFIG_FS_DAX
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
@@ -3402,17 +3402,54 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned long first_block = offset >> blkbits;
unsigned long last_block = (offset + length - 1) >> blkbits;
struct ext4_map_blocks map;
+ bool delalloc = false;
int ret;
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
- return -ERANGE;
+
+ if (flags & IOMAP_REPORT) {
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_inline_data_iomap(inode, iomap);
+ if (ret != -EAGAIN) {
+ if (ret == 0 && offset >= iomap->length)
+ ret = -ENOENT;
+ return ret;
+ }
+ }
+ } else {
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
+ }
map.m_lblk = first_block;
map.m_len = last_block - first_block + 1;
- if (!(flags & IOMAP_WRITE)) {
+ if (flags & IOMAP_REPORT) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
- } else {
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) {
+ ext4_lblk_t end = map.m_lblk + map.m_len - 1;
+ struct extent_status es;
+
+ ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
+
+ if (!es.es_len || es.es_lblk > end) {
+ /* entire range is a hole */
+ } else if (es.es_lblk > map.m_lblk) {
+ /* range starts with a hole */
+ map.m_len = es.es_lblk - map.m_lblk;
+ } else {
+ ext4_lblk_t offs = 0;
+
+ if (es.es_lblk < map.m_lblk)
+ offs = map.m_lblk - es.es_lblk;
+ map.m_lblk = es.es_lblk + offs;
+ map.m_len = es.es_len - offs;
+ delalloc = true;
+ }
+ }
+ } else if (flags & IOMAP_WRITE) {
int dio_credits;
handle_t *handle;
int retries = 0;
@@ -3463,17 +3500,21 @@ retry:
}
}
ext4_journal_stop(handle);
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
}
iomap->flags = 0;
iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev;
iomap->offset = first_block << blkbits;
+ iomap->length = (u64)map.m_len << blkbits;
if (ret == 0) {
- iomap->type = IOMAP_HOLE;
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->length = (u64)map.m_len << blkbits;
+ iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
if (map.m_flags & EXT4_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
@@ -3483,12 +3524,12 @@ retry:
WARN_ON_ONCE(1);
return -EIO;
}
- iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
- iomap->length = (u64)map.m_len << blkbits;
+ iomap->addr = (u64)map.m_pblk << blkbits;
}
if (map.m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+
return 0;
}
@@ -3549,8 +3590,6 @@ const struct iomap_ops ext4_iomap_ops = {
.iomap_end = ext4_iomap_end,
};
-#endif
-
static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
@@ -4572,6 +4611,21 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
+static bool ext4_should_use_dax(struct inode *inode)
+{
+ if (!test_opt(inode->i_sb, DAX))
+ return false;
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_has_inline_data(inode))
+ return false;
+ if (ext4_encrypted_inode(inode))
+ return false;
+ return true;
+}
+
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
@@ -4587,12 +4641,13 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) &&
- !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
- !ext4_encrypted_inode(inode))
+ if (ext4_should_use_dax(inode))
new_fl |= S_DAX;
+ if (flags & EXT4_ENCRYPT_FL)
+ new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
+ S_ENCRYPTED);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5308,6 +5363,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ error = fscrypt_prepare_setattr(dentry, attr);
+ if (error)
+ return error;
+
if (is_quota_modification(inode, attr)) {
error = dquot_initialize(inode);
if (error)
@@ -5353,14 +5412,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
loff_t oldsize = inode->i_size;
int shrink = (attr->ia_size <= inode->i_size);
- if (ext4_encrypted_inode(inode)) {
- error = fscrypt_get_encryption_info(inode);
- if (error)
- return error;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
-
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -5966,11 +6017,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
- /*
- * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated.
- * E.g. S_DAX may get cleared / set.
- */
- ext4_set_inode_flags(inode);
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_journal_flag_rwsem);
@@ -6106,70 +6152,3 @@ int ext4_filemap_fault(struct vm_fault *vmf)
return err;
}
-
-/*
- * Find the first extent at or after @lblk in an inode that is not a hole.
- * Search for @map_len blocks at most. The extent is returned in @result.
- *
- * The function returns 1 if we found an extent. The function returns 0 in
- * case there is no extent at or after @lblk and in that case also sets
- * @result->es_len to 0. In case of error, the error code is returned.
- */
-int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
- unsigned int map_len, struct extent_status *result)
-{
- struct ext4_map_blocks map;
- struct extent_status es = {};
- int ret;
-
- map.m_lblk = lblk;
- map.m_len = map_len;
-
- /*
- * For non-extent based files this loop may iterate several times since
- * we do not determine full hole size.
- */
- while (map.m_len > 0) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
- /* There's extent covering m_lblk? Just return it. */
- if (ret > 0) {
- int status;
-
- ext4_es_store_pblock(result, map.m_pblk);
- result->es_lblk = map.m_lblk;
- result->es_len = map.m_len;
- if (map.m_flags & EXT4_MAP_UNWRITTEN)
- status = EXTENT_STATUS_UNWRITTEN;
- else
- status = EXTENT_STATUS_WRITTEN;
- ext4_es_store_status(result, status);
- return 1;
- }
- ext4_es_find_delayed_extent_range(inode, map.m_lblk,
- map.m_lblk + map.m_len - 1,
- &es);
- /* Is delalloc data before next block in extent tree? */
- if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
- ext4_lblk_t offset = 0;
-
- if (es.es_lblk < lblk)
- offset = lblk - es.es_lblk;
- result->es_lblk = es.es_lblk + offset;
- ext4_es_store_pblock(result,
- ext4_es_pblock(&es) + offset);
- result->es_len = es.es_len - offset;
- ext4_es_store_status(result, ext4_es_status(&es));
-
- return 1;
- }
- /* There's a hole at m_lblk, advance us after it */
- map.m_lblk += map.m_len;
- map_len -= map.m_len;
- map.m_len = map_len;
- cond_resched();
- }
- result->es_len = 0;
- return 0;
-}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index afb66d4ab5cf..b7558f292420 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/ioctl.c
*
@@ -14,6 +15,7 @@
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/quotaops.h>
+#include <linux/random.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
@@ -98,7 +100,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
return -EINVAL;
@@ -157,10 +158,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- inode_bl->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
+ inode->i_generation = prandom_u32();
+ inode_bl->i_generation = prandom_u32();
ext4_discard_preallocations(inode);
@@ -290,10 +289,20 @@ flags_err:
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ /*
+ * Changes to the journaling mode can cause unsafe changes to
+ * S_DAX if we are using the DAX mount option.
+ */
+ if (test_opt(inode->i_sb, DAX)) {
+ err = -EBUSY;
+ goto flags_out;
+ }
+
err = ext4_change_inode_journal_flag(inode, jflag);
- if (err)
- goto flags_out;
+ if (err)
+ goto flags_out;
+ }
if (migrate) {
if (flags & EXT4_EXTENTS_FL)
err = ext4_ext_migrate(inode);
@@ -861,12 +870,6 @@ group_add_out:
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not (yet) supported with bigalloc");
- return -EOPNOTSUPP;
- }
-
if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
sizeof(__u64))) {
return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 701085620cd8..d9f8b90a93ed 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4994,8 +4994,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int err = 0, ret, blk_free_count;
- ext4_grpblk_t blocks_freed;
+ int err = 0, ret, free_clusters_count;
+ ext4_grpblk_t clusters_freed;
+ ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
+ ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
+ unsigned long cluster_count = last_cluster - first_cluster + 1;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
@@ -5007,8 +5010,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- ext4_warning(sb, "too much blocks added to group %u",
+ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too many blocks added to group %u",
block_group);
err = -EINVAL;
goto error_return;
@@ -5054,14 +5057,14 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
if (err)
goto error_return;
- for (i = 0, blocks_freed = 0; i < count; i++) {
+ for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
ext4_error(sb, "bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
- blocks_freed++;
+ clusters_freed++;
}
}
@@ -5075,19 +5078,20 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* them with group lock_held
*/
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
- mb_free_blocks(NULL, &e4b, bit, count);
- blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
- ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
+ mb_free_blocks(NULL, &e4b, bit, cluster_count);
+ free_clusters_count = clusters_freed +
+ ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, free_clusters_count);
ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_NUM_B2C(sbi, blocks_freed));
+ clusters_freed);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+ atomic64_add(clusters_freed,
&sbi->s_flex_groups[flex_group].free_clusters);
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 009300ee1561..dcf52540f379 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/ext4/mballoc.h
*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 84c54f15f1dd..27b9a76a0dfa 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/random.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c1cf020d1889..798b3ac680db 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/namei.c
*
@@ -1538,24 +1539,14 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct inode *inode;
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
+ int err;
- if (ext4_encrypted_inode(dir)) {
- int res = fscrypt_get_encryption_info(dir);
-
- /*
- * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
- * created while the directory was encrypted and we
- * have access to the key.
- */
- if (fscrypt_has_encryption_key(dir))
- fscrypt_set_encrypted_dentry(dentry);
- fscrypt_set_d_op(dentry);
- if (res && res != -ENOKEY)
- return ERR_PTR(res);
- }
+ err = fscrypt_prepare_lookup(dir, dentry, flags);
+ if (err)
+ return ERR_PTR(err);
- if (dentry->d_name.len > EXT4_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (dentry->d_name.len > EXT4_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
@@ -3221,9 +3212,10 @@ static int ext4_link(struct dentry *old_dentry,
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
- if (ext4_encrypted_inode(dir) &&
- !fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
(!projid_eq(EXT4_I(dir)->i_projid,
@@ -3515,12 +3507,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
EXT4_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
- if ((ext4_encrypted_inode(old_dir) &&
- !fscrypt_has_encryption_key(old_dir)) ||
- (ext4_encrypted_inode(new_dir) &&
- !fscrypt_has_encryption_key(new_dir)))
- return -ENOKEY;
-
retval = dquot_initialize(old.dir);
if (retval)
return retval;
@@ -3549,13 +3535,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
- if ((old.dir != new.dir) &&
- ext4_encrypted_inode(new.dir) &&
- !fscrypt_has_permitted_context(new.dir, old.inode)) {
- retval = -EPERM;
- goto end_rename;
- }
-
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
@@ -3721,19 +3700,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
int retval;
struct timespec ctime;
- if ((ext4_encrypted_inode(old_dir) &&
- !fscrypt_has_encryption_key(old_dir)) ||
- (ext4_encrypted_inode(new_dir) &&
- !fscrypt_has_encryption_key(new_dir)))
- return -ENOKEY;
-
- if ((ext4_encrypted_inode(old_dir) ||
- ext4_encrypted_inode(new_dir)) &&
- (old_dir != new_dir) &&
- (!fscrypt_has_permitted_context(new_dir, old.inode) ||
- !fscrypt_has_permitted_context(old_dir, new.inode)))
- return -EPERM;
-
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
EXT4_I(old_dentry->d_inode)->i_projid)) ||
@@ -3860,12 +3826,19 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
+ int err;
+
if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
return -EIO;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
+ err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+ if (err)
+ return err;
+
if (flags & RENAME_EXCHANGE) {
return ext4_cross_rename(old_dir, old_dentry,
new_dir, new_dentry);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 55ad7dd149d0..db7590178dfc 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/page-io.c
*
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 04c90643af7a..9ffa6fad18db 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/readpage.c
*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 035cd3f4785e..50443bda8e98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/resize.c
*
@@ -106,7 +107,7 @@ static int verify_group_input(struct super_block *sb,
overhead = ext4_group_overhead_blocks(sb, group);
metaend = start + overhead;
- input->free_blocks_count = free_blocks_count =
+ input->free_clusters_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
if (test_opt(sb, DEBUG))
@@ -257,6 +258,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
ext4_group_t last_group;
unsigned overhead;
__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
+ int i;
BUG_ON(flex_gd->count == 0 || group_data == NULL);
@@ -293,7 +295,7 @@ next_group:
group_data[bb_index].block_bitmap = start_blk++;
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
- group_data[group].free_blocks_count--;
+ group_data[group].mdata_blocks++;
flex_gd->bg_flags[group] &= uninit_mask;
}
@@ -304,7 +306,7 @@ next_group:
group_data[ib_index].inode_bitmap = start_blk++;
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
- group_data[group].free_blocks_count--;
+ group_data[group].mdata_blocks++;
flex_gd->bg_flags[group] &= uninit_mask;
}
@@ -323,15 +325,22 @@ next_group:
if (start_blk + itb > next_group_start) {
flex_gd->bg_flags[group + 1] &= uninit_mask;
overhead = start_blk + itb - next_group_start;
- group_data[group + 1].free_blocks_count -= overhead;
+ group_data[group + 1].mdata_blocks += overhead;
itb -= overhead;
}
- group_data[group].free_blocks_count -= itb;
+ group_data[group].mdata_blocks += itb;
flex_gd->bg_flags[group] &= uninit_mask;
start_blk += EXT4_SB(sb)->s_itb_per_group;
}
+ /* Update free clusters count to exclude metadata blocks */
+ for (i = 0; i < flex_gd->count; i++) {
+ group_data[i].free_clusters_count -=
+ EXT4_NUM_B2C(EXT4_SB(sb),
+ group_data[i].mdata_blocks);
+ }
+
if (test_opt(sb, DEBUG)) {
int i;
group = group_data[0].group;
@@ -341,12 +350,13 @@ next_group:
flexbg_size);
for (i = 0; i < flex_gd->count; i++) {
- printk(KERN_DEBUG "adding %s group %u: %u "
- "blocks (%d free)\n",
+ ext4_debug(
+ "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
ext4_bg_has_super(sb, group + i) ? "normal" :
"no-super", group + i,
group_data[i].blocks_count,
- group_data[i].free_blocks_count);
+ group_data[i].free_clusters_count,
+ group_data[i].mdata_blocks);
}
}
return 0;
@@ -398,7 +408,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
}
/*
- * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used.
*
* Helper function for ext4_setup_new_group_blocks() which set .
*
@@ -408,22 +418,26 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
*/
static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
struct ext4_new_flex_group_data *flex_gd,
- ext4_fsblk_t block, ext4_group_t count)
+ ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t count = last_cluster - first_cluster + 1;
ext4_group_t count2;
- ext4_debug("mark blocks [%llu/%u] used\n", block, count);
- for (count2 = count; count > 0; count -= count2, block += count2) {
+ ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
+ last_cluster);
+ for (count2 = count; count > 0;
+ count -= count2, first_cluster += count2) {
ext4_fsblk_t start;
struct buffer_head *bh;
ext4_group_t group;
int err;
- group = ext4_get_group_number(sb, block);
- start = ext4_group_first_block_no(sb, group);
+ group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster));
+ start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group));
group -= flex_gd->groups[0].group;
- count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
+ count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start);
if (count2 > count)
count2 = count;
@@ -444,9 +458,9 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
err = ext4_journal_get_write_access(handle, bh);
if (err)
return err;
- ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
- block - start, count2);
- ext4_set_bits(bh->b_data, block - start, count2);
+ ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
+ first_cluster, first_cluster - start, count2);
+ ext4_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (unlikely(err))
@@ -595,9 +609,10 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0, overhead);
+ ext4_set_bits(bh->b_data, 0,
+ EXT4_NUM_B2C(sbi, overhead));
}
- ext4_mark_bitmap_end(group_data[i].blocks_count,
+ ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
sb->s_blocksize * 8, bh->b_data);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (err)
@@ -642,7 +657,11 @@ handle_ib:
continue;
}
err = set_flexbg_block_bitmap(sb, handle,
- flex_gd, start, count);
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
if (err)
goto out;
count = group_table_count[j];
@@ -652,7 +671,11 @@ handle_ib:
if (count) {
err = set_flexbg_block_bitmap(sb, handle,
- flex_gd, start, count);
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
if (err)
goto out;
}
@@ -840,7 +863,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
ext4_std_error(sb, err);
goto exit_inode;
}
- inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+ inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >>
+ (9 - EXT4_SB(sb)->s_cluster_bits);
ext4_mark_iloc_dirty(handle, inode, &iloc);
memset(gdb_bh->b_data, 0, sb->s_blocksize);
err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
@@ -935,6 +959,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
{
struct super_block *sb = inode->i_sb;
int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+ int cluster_bits = EXT4_SB(sb)->s_cluster_bits;
struct buffer_head **primary;
struct buffer_head *dind;
struct ext4_iloc iloc;
@@ -1010,7 +1035,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
if (!err)
err = err2;
}
- inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+
+ inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits);
ext4_mark_iloc_dirty(handle, inode, &iloc);
exit_bh:
@@ -1244,7 +1270,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_group_t group;
__u16 *bg_flags = flex_gd->bg_flags;
int i, gdb_off, gdb_num, err = 0;
-
+
for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
group = group_data->group;
@@ -1271,7 +1297,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_inode_table_set(sb, gdp, group_data->inode_table);
ext4_free_group_clusters_set(sb, gdp,
- EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
+ group_data->free_clusters_count);
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
if (ext4_has_group_desc_csum(sb))
ext4_itable_unused_set(sb, gdp,
@@ -1327,7 +1353,7 @@ static void ext4_update_super(struct super_block *sb,
*/
for (i = 0; i < flex_gd->count; i++) {
blocks_count += group_data[i].blocks_count;
- free_blocks += group_data[i].free_blocks_count;
+ free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count);
}
reserved_blocks = ext4_r_blocks_count(es) * 100;
@@ -1499,17 +1525,18 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
ext4_fsblk_t n_blocks_count,
unsigned long flexbg_size)
{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
struct ext4_new_group_data *group_data = flex_gd->groups;
ext4_fsblk_t o_blocks_count;
ext4_group_t n_group;
ext4_group_t group;
ext4_group_t last_group;
ext4_grpblk_t last;
- ext4_grpblk_t blocks_per_group;
+ ext4_grpblk_t clusters_per_group;
unsigned long i;
- blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+ clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb);
o_blocks_count = ext4_blocks_count(es);
@@ -1530,9 +1557,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
int overhead;
group_data[i].group = group + i;
- group_data[i].blocks_count = blocks_per_group;
+ group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb);
overhead = ext4_group_overhead_blocks(sb, group + i);
- group_data[i].free_blocks_count = blocks_per_group - overhead;
+ group_data[i].mdata_blocks = overhead;
+ group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb);
if (ext4_has_group_desc_csum(sb)) {
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
EXT4_BG_INODE_UNINIT;
@@ -1546,10 +1574,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
/* We need to initialize block bitmap of last group. */
flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
- if ((last_group == n_group) && (last != blocks_per_group - 1)) {
- group_data[i - 1].blocks_count = last + 1;
- group_data[i - 1].free_blocks_count -= blocks_per_group-
- last - 1;
+ if ((last_group == n_group) && (last != clusters_per_group - 1)) {
+ group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1);
+ group_data[i - 1].free_clusters_count -= clusters_per_group -
+ last - 1;
}
return 1;
@@ -1796,7 +1824,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
}
/* Do a quick sanity check of the resize inode */
- if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+ if (inode->i_blocks != 1 << (inode->i_blkbits -
+ (9 - sbi->s_cluster_bits)))
goto invalid_resize_inode;
for (i = 0; i < EXT4_N_BLOCKS; i++) {
if (i == EXT4_DIND_BLOCK) {
@@ -1959,7 +1988,7 @@ retry:
if (n_group == o_group)
add = n_blocks_count - o_blocks_count;
else
- add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+ add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1));
if (add > 0) {
err = ext4_group_extend_no_check(sb, o_blocks_count, add);
if (err)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b104096fce9e..0556cd036b69 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1159,6 +1159,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
if (inode->i_ino == EXT4_ROOT_INO)
return -EPERM;
+ if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
+ return -EINVAL;
+
res = ext4_convert_inline_data(inode);
if (res)
return res;
@@ -1181,7 +1184,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
ext4_clear_inode_state(inode,
EXT4_STATE_MAY_INLINE_DATA);
/*
- * Update inode->i_flags - e.g. S_DAX may get disabled
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
*/
ext4_set_inode_flags(inode);
}
@@ -1206,7 +1210,10 @@ retry:
ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- /* Update inode->i_flags - e.g. S_DAX may get disabled */
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
ext4_set_inode_flags(inode);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
@@ -1237,14 +1244,9 @@ static const struct fscrypt_operations ext4_cryptops = {
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.dummy_context = ext4_dummy_context,
- .is_encrypted = ext4_encrypted_inode,
.empty_dir = ext4_empty_dir,
.max_namelen = ext4_max_namelen,
};
-#else
-static const struct fscrypt_operations ext4_cryptops = {
- .is_encrypted = ext4_encrypted_inode,
-};
#endif
#ifdef CONFIG_QUOTA
@@ -1677,7 +1679,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
return 1;
case Opt_i_version:
- sb->s_flags |= MS_I_VERSION;
+ sb->s_flags |= SB_I_VERSION;
return 1;
case Opt_lazytime:
sb->s_flags |= MS_LAZYTIME;
@@ -2060,7 +2062,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
- if (sb->s_flags & MS_I_VERSION)
+ if (sb->s_flags & SB_I_VERSION)
SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
@@ -2791,14 +2793,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
* This function is called once a day if we have errors logged
* on the file system
*/
-static void print_daily_error_info(unsigned long arg)
+static void print_daily_error_info(struct timer_list *t)
{
- struct super_block *sb = (struct super_block *) arg;
- struct ext4_sb_info *sbi;
- struct ext4_super_block *es;
-
- sbi = EXT4_SB(sb);
- es = sbi->s_es;
+ struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
+ struct super_block *sb = sbi->s_sb;
+ struct ext4_super_block *es = sbi->s_es;
if (es->s_error_count)
/* fsck newer than v1.41.13 is needed to clean this condition. */
@@ -3708,6 +3707,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (ext4_has_feature_inline_data(sb)) {
+ ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
+ " that may contain inline data");
+ goto failed_mount;
+ }
err = bdev_dax_supported(sb, blocksize);
if (err)
goto failed_mount;
@@ -3977,11 +3981,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_gdb_count = db_count;
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
- spin_lock_init(&sbi->s_next_gen_lock);
- setup_timer(&sbi->s_err_report, print_daily_error_info,
- (unsigned long) sb);
+ timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
/* Register extent status tree shrinker */
if (ext4_es_register_shrinker(sbi))
@@ -3996,7 +3997,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
+#endif
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -4612,7 +4615,8 @@ static int ext4_load_journal(struct super_block *sb,
"required on readonly filesystem");
if (really_read_only) {
ext4_msg(sb, KERN_ERR, "write access "
- "unavailable, cannot proceed");
+ "unavailable, cannot proceed "
+ "(try mounting with noload)");
return -EROFS;
}
ext4_msg(sb, KERN_INFO, "write access will "
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 5c8fc53cb0e5..a2006c9af1d9 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/symlink.c
*
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 48c7a7d55ed3..e21afd52e7d7 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/sysfs.c
*
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index c70d06a383e2..b64a9fa0ff41 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/ext4/truncate.h
*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3b69330a4250..218a7ba57819 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/xattr.c
*
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 0d2dde1fa87a..f8cc07588ac9 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
File: fs/ext4/xattr.h
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index a8921112030d..629001b28632 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/xattr_security.c
* Handler for storing security labels as extended attributes.
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index c7765c735714..e9389e5d75c3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/xattr_trusted.c
* Handler for trusted extended attributes.
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index ca20e423034b..d4546184b34b 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/xattr_user.c
* Handler for extended user attributes.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index a0dc559b1b47..776c4b936504 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_F2FS_FS) += f2fs.o
f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9a7c90386947..115204fdefcc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -23,13 +23,11 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/quotaops.h>
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
#include <crypto/hash.h>
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
#else
@@ -2525,7 +2523,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
void stop_discard_thread(struct f2fs_sb_info *sbi);
-void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount);
void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void release_discard_addrs(struct f2fs_sb_info *sbi);
int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
@@ -2949,6 +2947,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
file_set_encrypt(inode);
+ inode->i_flags |= S_ENCRYPTED;
#endif
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 50c88e37ed66..53fb08810ee9 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
+ if (f2fs_encrypted_inode(inode))
+ new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
+ S_ENCRYPTED);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 621b9b3d320b..c695ff462ee6 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1210,11 +1210,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi)
}
/* This comes from f2fs_put_super and f2fs_trim_fs */
-void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount)
{
__issue_discard_cmd(sbi, false);
__drop_discard_cmd(sbi);
- __wait_discard_cmd(sbi, false);
+ __wait_discard_cmd(sbi, !umount);
}
static void mark_discard_range_all(struct f2fs_sb_info *sbi)
@@ -2244,7 +2244,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
}
/* It's time to issue all the filed discards */
mark_discard_range_all(sbi);
- f2fs_wait_discard_bios(sbi);
+ f2fs_wait_discard_bios(sbi, false);
out:
range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
return err;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 89f61eb3d167..97e03c637e90 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- f2fs_wait_discard_bios(sbi);
+ f2fs_wait_discard_bios(sbi, true);
if (f2fs_discard_en(sbi) && !sbi->discard_blks) {
struct cp_control cpc = {
@@ -1594,14 +1594,9 @@ static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
- .is_encrypted = f2fs_encrypted_inode,
.empty_dir = f2fs_empty_dir,
.max_namelen = f2fs_max_namelen,
};
-#else
-static const struct fscrypt_operations f2fs_cryptops = {
- .is_encrypted = f2fs_encrypted_inode,
-};
#endif
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -2320,7 +2315,9 @@ try_onemore:
#endif
sb->s_op = &f2fs_sops;
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
sb->s_cop = &f2fs_cryptops;
+#endif
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index 964b634f6667..70645ce2f7fc 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux fat filesystem support.
#
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 5d384921524d..e9bed49df6b7 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/fat/cache.c
*
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 051dac1ce3be..8fc1093da47d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FAT_H
#define _FAT_H
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 448a1119f0be..30f47d0f74a0 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/fcntl.c
*
@@ -724,7 +725,7 @@ static void send_sigio_to_task(struct task_struct *p,
* F_SETSIG can change ->signum lockless in parallel, make
* sure we read it once and use the same value throughout.
*/
- int signum = ACCESS_ONCE(fown->signum);
+ int signum = READ_ONCE(fown->signum);
if (!sigio_perm(p, fown, signum))
return;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 58a61f55e0d0..474adc8d2a3a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/fs/file.c b/fs/file.c
index 1fc7fbbb4510..4eecbf4244a5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/file.c
*
diff --git a/fs/file_table.c b/fs/file_table.c
index 61517f57f8ef..49e1f2f1a4cb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -201,11 +201,11 @@ static void __fput(struct file *file)
eventpoll_release(file);
locks_remove_file(file);
+ ima_file_free(file);
if (unlikely(file->f_flags & FASYNC)) {
if (file->f_op->fasync)
file->f_op->fasync(-1, file, 0);
}
- ima_file_free(file);
if (file->f_op->release)
file->f_op->release(inode, file);
security_file_free(file);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a920ad2629ac..f2728a4a03a1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/filesystems.c
*
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index e747b3d720ee..a6497cf8ae53 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -78,7 +79,7 @@ void mnt_pin_kill(struct mount *m)
while (1) {
struct hlist_node *p;
rcu_read_lock();
- p = ACCESS_ONCE(m->mnt_pins.first);
+ p = READ_ONCE(m->mnt_pins.first);
if (!p) {
rcu_read_unlock();
break;
@@ -92,7 +93,7 @@ void group_pin_kill(struct hlist_head *p)
while (1) {
struct hlist_node *q;
rcu_read_lock();
- q = ACCESS_ONCE(p->first);
+ q = READ_ONCE(p->first);
if (!q) {
rcu_read_unlock();
break;
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 6d561531cb36..79e08e05ef84 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for general filesystem caching code
#
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index b5ab06fabc60..0438d4cd91ef 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -331,6 +331,13 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
rcu_read_lock();
confkey = user_key_payload_rcu(key);
+ if (!confkey) {
+ /* key was revoked */
+ rcu_read_unlock();
+ key_put(key);
+ goto no_config;
+ }
+
buf = confkey->data;
for (len = confkey->datalen - 1; len >= 0; len--) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 13c65dd2d37d..a42d89371748 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -33,7 +33,7 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
* Lockless access is OK, because file->private data is set
* once during mount and is valid until the file is released.
*/
- return ACCESS_ONCE(file->private_data);
+ return READ_ONCE(file->private_data);
}
static void fuse_request_init(struct fuse_req *req, struct page **pages,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 622081b97426..24967382a7b1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1308,7 +1308,8 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
*/
over = !dir_emit(ctx, dirent->name, dirent->namelen,
dirent->ino, dirent->type);
- ctx->pos = dirent->off;
+ if (!over)
+ ctx->pos = dirent->off;
}
buf += reclen;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 65c88379a3a1..94a745acaef8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1059,7 +1059,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_flags & MS_MANDLOCK)
goto err;
- sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
+ sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
if (!parse_fuse_opt(data, &d, is_bdev))
goto err;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 90c6a8faaecb..43c827a7cce5 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -4,6 +4,7 @@ config GFS2_FS
select FS_POSIX_ACL
select CRC32
select QUOTACTL
+ select FS_IOMAP
help
A cluster filesystem.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 86128202384f..41b2aa4bc3bf 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-y := -I$(src)
obj-$(CONFIG_GFS2_FS) += gfs2.o
gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 9d5eecb123de..776717f1eeea 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -141,6 +141,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
ret = __gfs2_set_acl(inode, acl, type);
if (!ret && mode != inode->i_mode) {
+ inode->i_ctime = current_time(inode);
inode->i_mode = mode;
mark_inode_dirty(inode);
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3dd0cceefa43..d5f0d96169c5 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
+#include <linux/iomap.h>
#include "gfs2.h"
#include "incore.h"
@@ -36,6 +37,8 @@
struct metapath {
struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
__u16 mp_list[GFS2_MAX_META_HEIGHT];
+ int mp_fheight; /* find_metapath height */
+ int mp_aheight; /* actual height (lookup height) */
};
/**
@@ -235,9 +238,9 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
{
unsigned int i;
+ mp->mp_fheight = height;
for (i = height; i--;)
mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
-
}
static inline unsigned int metapath_branch_start(const struct metapath *mp)
@@ -248,7 +251,7 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
}
/**
- * metaptr1 - Return the first possible metadata pointer in a metaath buffer
+ * metaptr1 - Return the first possible metadata pointer in a metapath buffer
* @height: The metadata height (0 = dinode)
* @mp: The metapath
*/
@@ -345,10 +348,13 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
for (x = 0; x < end_of_metadata; x++) {
ret = lookup_mp_height(ip, mp, x);
if (ret)
- return ret;
+ goto out;
}
- return ip->i_height;
+ ret = ip->i_height;
+out:
+ mp->mp_aheight = ret;
+ return ret;
}
/**
@@ -480,10 +486,11 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
* @inode: The GFS2 inode
* @lblock: The logical starting block of the extent
* @bh_map: This is used to return the mapping details
- * @mp: The metapath
- * @sheight: The starting height (i.e. whats already mapped)
- * @height: The height to build to
+ * @zero_new: True if newly allocated blocks should be zeroed
+ * @mp: The metapath, with proper height information calculated
* @maxlen: The max number of data blocks to alloc
+ * @dblock: Pointer to return the resulting new block
+ * @dblks: Pointer to return the number of blocks allocated
*
* In this routine we may have to alloc:
* i) Indirect blocks to grow the metadata tree height
@@ -499,63 +506,63 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
* Returns: errno on error
*/
-static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
- struct buffer_head *bh_map, struct metapath *mp,
- const unsigned int sheight,
- const unsigned int height,
- const size_t maxlen)
+static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
+ unsigned flags, struct metapath *mp)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct super_block *sb = sdp->sd_vfs;
struct buffer_head *dibh = mp->mp_bh[0];
- u64 bn, dblock = 0;
+ u64 bn;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
- const unsigned end_of_metadata = height - 1;
+ const unsigned end_of_metadata = mp->mp_fheight - 1;
int ret;
- int eob = 0;
enum alloc_state state;
__be64 *ptr;
__be64 zero_bn = 0;
+ size_t maxlen = iomap->length >> inode->i_blkbits;
- BUG_ON(sheight < 1);
+ BUG_ON(mp->mp_aheight < 1);
BUG_ON(dibh == NULL);
gfs2_trans_add_meta(ip->i_gl, dibh);
- if (height == sheight) {
+ if (mp->mp_fheight == mp->mp_aheight) {
struct buffer_head *bh;
+ int eob;
+
/* Bottom indirect block exists, find unalloced extent size */
ptr = metapointer(end_of_metadata, mp);
bh = mp->mp_bh[end_of_metadata];
- dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
- &eob);
+ dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
+ maxlen, &eob);
BUG_ON(dblks < 1);
state = ALLOC_DATA;
} else {
/* Need to allocate indirect blocks */
- ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
+ ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
+ sdp->sd_diptrs;
dblks = min(maxlen, (size_t)(ptrs_per_blk -
mp->mp_list[end_of_metadata]));
- if (height == ip->i_height) {
+ if (mp->mp_fheight == ip->i_height) {
/* Writing into existing tree, extend tree down */
- iblks = height - sheight;
+ iblks = mp->mp_fheight - mp->mp_aheight;
state = ALLOC_GROW_DEPTH;
} else {
/* Building up tree height */
state = ALLOC_GROW_HEIGHT;
- iblks = height - ip->i_height;
+ iblks = mp->mp_fheight - ip->i_height;
branch_start = metapath_branch_start(mp);
- iblks += (height - branch_start);
+ iblks += (mp->mp_fheight - branch_start);
}
}
/* start of the second part of the function (state machine) */
blks = dblks + iblks;
- i = sheight;
+ i = mp->mp_aheight;
do {
int error;
n = blks - alloced;
@@ -573,9 +580,10 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
sizeof(struct gfs2_dinode));
zero_bn = *ptr;
}
- for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
+ for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
+ i++, n--)
gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
- if (i - 1 == height - ip->i_height) {
+ if (i - 1 == mp->mp_fheight - ip->i_height) {
i--;
gfs2_buffer_copy_tail(mp->mp_bh[i],
sizeof(struct gfs2_meta_header),
@@ -587,7 +595,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
sizeof(struct gfs2_meta_header));
*ptr = zero_bn;
state = ALLOC_GROW_DEPTH;
- for(i = branch_start; i < height; i++) {
+ for(i = branch_start; i < mp->mp_fheight; i++) {
if (mp->mp_bh[i] == NULL)
break;
brelse(mp->mp_bh[i]);
@@ -599,12 +607,12 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
break;
/* Branching from existing tree */
case ALLOC_GROW_DEPTH:
- if (i > 1 && i < height)
+ if (i > 1 && i < mp->mp_fheight)
gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
- for (; i < height && n > 0; i++, n--)
+ for (; i < mp->mp_fheight && n > 0; i++, n--)
gfs2_indirect_init(mp, ip->i_gl, i,
mp->mp_list[i-1], bn++);
- if (i == height)
+ if (i == mp->mp_fheight)
state = ALLOC_DATA;
if (n == 0)
break;
@@ -615,119 +623,269 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
dblks = n;
ptr = metapointer(end_of_metadata, mp);
- dblock = bn;
+ iomap->addr = bn << inode->i_blkbits;
+ iomap->flags |= IOMAP_F_NEW;
while (n-- > 0)
*ptr++ = cpu_to_be64(bn++);
- if (buffer_zeronew(bh_map)) {
- ret = sb_issue_zeroout(sb, dblock, dblks,
- GFP_NOFS);
+ if (flags & IOMAP_ZERO) {
+ ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
+ dblks, GFP_NOFS);
if (ret) {
fs_err(sdp,
"Failed to zero data buffers\n");
- clear_buffer_zeronew(bh_map);
+ flags &= ~IOMAP_ZERO;
}
}
break;
}
- } while ((state != ALLOC_DATA) || !dblock);
+ } while (iomap->addr == IOMAP_NULL_ADDR);
- ip->i_height = height;
+ iomap->length = (u64)dblks << inode->i_blkbits;
+ ip->i_height = mp->mp_fheight;
gfs2_add_inode_blocks(&ip->i_inode, alloced);
gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
- map_bh(bh_map, inode->i_sb, dblock);
- bh_map->b_size = dblks << inode->i_blkbits;
- set_buffer_new(bh_map);
return 0;
}
/**
- * gfs2_block_map - Map a block from an inode to a disk block
+ * hole_size - figure out the size of a hole
* @inode: The inode
- * @lblock: The logical block number
- * @bh_map: The bh to be mapped
- * @create: True if its ok to alloc blocks to satify the request
+ * @lblock: The logical starting block number
+ * @mp: The metapath
*
- * Sets buffer_mapped() if successful, sets buffer_boundary() if a
- * read of metadata will be required before the next block can be
- * mapped. Sets buffer_new() if new blocks were allocated.
+ * Returns: The hole size in bytes
*
- * Returns: errno
*/
+static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct metapath mp_eof;
+ u64 factor = 1;
+ int hgt;
+ u64 holesz = 0;
+ const __be64 *first, *end, *ptr;
+ const struct buffer_head *bh;
+ u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
+ int zeroptrs;
+ bool done = false;
+
+ /* Get another metapath, to the very last byte */
+ find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
+ for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
+ bh = mp->mp_bh[hgt];
+ if (bh) {
+ zeroptrs = 0;
+ first = metapointer(hgt, mp);
+ end = (const __be64 *)(bh->b_data + bh->b_size);
+
+ for (ptr = first; ptr < end; ptr++) {
+ if (*ptr) {
+ done = true;
+ break;
+ } else {
+ zeroptrs++;
+ }
+ }
+ } else {
+ zeroptrs = sdp->sd_inptrs;
+ }
+ if (factor * zeroptrs >= lblock_stop - lblock + 1) {
+ holesz = lblock_stop - lblock + 1;
+ break;
+ }
+ holesz += factor * zeroptrs;
-int gfs2_block_map(struct inode *inode, sector_t lblock,
- struct buffer_head *bh_map, int create)
+ factor *= sdp->sd_inptrs;
+ if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
+ (mp->mp_list[hgt - 1])++;
+ }
+ return holesz << inode->i_blkbits;
+}
+
+static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+
+ iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
+ sizeof(struct gfs2_dinode);
+ iomap->offset = 0;
+ iomap->length = i_size_read(inode);
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_DATA_INLINE;
+}
+
+/**
+ * gfs2_iomap_begin - Map blocks from an inode to disk blocks
+ * @inode: The inode
+ * @pos: Starting position in bytes
+ * @length: Length to map, in bytes
+ * @flags: iomap flags
+ * @iomap: The iomap structure
+ *
+ * Returns: errno
+ */
+int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- unsigned int bsize = sdp->sd_sb.sb_bsize;
- const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
+ struct metapath mp = { .mp_aheight = 1, };
+ unsigned int factor = sdp->sd_sb.sb_bsize;
const u64 *arr = sdp->sd_heightsize;
__be64 *ptr;
- u64 size;
- struct metapath mp;
+ sector_t lblock;
+ sector_t lend;
int ret;
int eob;
unsigned int len;
struct buffer_head *bh;
u8 height;
- BUG_ON(maxlen == 0);
+ trace_gfs2_iomap_start(ip, pos, length, flags);
+ if (!length) {
+ ret = -EINVAL;
+ goto out;
+ }
- memset(&mp, 0, sizeof(mp));
- bmap_lock(ip, create);
- clear_buffer_mapped(bh_map);
- clear_buffer_new(bh_map);
- clear_buffer_boundary(bh_map);
- trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
+ if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
+ gfs2_stuffed_iomap(inode, iomap);
+ if (pos >= iomap->length)
+ return -ENOENT;
+ ret = 0;
+ goto out;
+ }
+
+ lblock = pos >> inode->i_blkbits;
+ lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
+
+ iomap->offset = lblock << inode->i_blkbits;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
+ iomap->flags = IOMAP_F_MERGED;
+ bmap_lock(ip, 0);
+
+ /*
+ * Directory data blocks have a struct gfs2_meta_header header, so the
+ * remaining size is smaller than the filesystem block size. Logical
+ * block numbers for directories are in units of this remaining size!
+ */
if (gfs2_is_dir(ip)) {
- bsize = sdp->sd_jbsize;
+ factor = sdp->sd_jbsize;
arr = sdp->sd_jheightsize;
}
ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
if (ret)
- goto out;
+ goto out_release;
height = ip->i_height;
- size = (lblock + 1) * bsize;
- while (size > arr[height])
+ while ((lblock + 1) * factor > arr[height])
height++;
find_metapath(sdp, lblock, &mp, height);
- ret = 1;
if (height > ip->i_height || gfs2_is_stuffed(ip))
goto do_alloc;
+
ret = lookup_metapath(ip, &mp);
if (ret < 0)
- goto out;
- if (ret != ip->i_height)
+ goto out_release;
+
+ if (mp.mp_aheight != ip->i_height)
goto do_alloc;
+
ptr = metapointer(ip->i_height - 1, &mp);
if (*ptr == 0)
goto do_alloc;
- map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+
bh = mp.mp_bh[ip->i_height - 1];
- len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
- bh_map->b_size = (len << inode->i_blkbits);
+ len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
if (eob)
- set_buffer_boundary(bh_map);
+ iomap->flags |= IOMAP_F_BOUNDARY;
+ iomap->length = (u64)len << inode->i_blkbits;
+
ret = 0;
-out:
+
+out_release:
release_metapath(&mp);
- trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
- bmap_unlock(ip, create);
+ bmap_unlock(ip, 0);
+out:
+ trace_gfs2_iomap_end(ip, iomap, ret);
return ret;
do_alloc:
- /* All allocations are done here, firstly check create flag */
- if (!create) {
- BUG_ON(gfs2_is_stuffed(ip));
+ if (!(flags & IOMAP_WRITE)) {
+ if (pos >= i_size_read(inode)) {
+ ret = -ENOENT;
+ goto out_release;
+ }
ret = 0;
+ iomap->length = hole_size(inode, lblock, &mp);
+ goto out_release;
+ }
+
+ ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+ goto out_release;
+}
+
+/**
+ * gfs2_block_map - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @bh_map: The bh to be mapped
+ * @create: True if its ok to alloc blocks to satify the request
+ *
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
+ *
+ * Returns: errno
+ */
+
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_map, int create)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct iomap iomap;
+ int ret, flags = 0;
+
+ clear_buffer_mapped(bh_map);
+ clear_buffer_new(bh_map);
+ clear_buffer_boundary(bh_map);
+ trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
+
+ if (create)
+ flags |= IOMAP_WRITE;
+ if (buffer_zeronew(bh_map))
+ flags |= IOMAP_ZERO;
+ ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
+ bh_map->b_size, flags, &iomap);
+ if (ret) {
+ if (!create && ret == -ENOENT) {
+ /* Return unmapped buffer beyond the end of file. */
+ ret = 0;
+ }
goto out;
}
- /* At this point ret is the tree depth of already allocated blocks */
- ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
- goto out;
+ if (iomap.length > bh_map->b_size) {
+ iomap.length = bh_map->b_size;
+ iomap.flags &= ~IOMAP_F_BOUNDARY;
+ }
+ if (iomap.addr != IOMAP_NULL_ADDR)
+ map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
+ bh_map->b_size = iomap.length;
+ if (iomap.flags & IOMAP_F_BOUNDARY)
+ set_buffer_boundary(bh_map);
+ if (iomap.flags & IOMAP_F_NEW)
+ set_buffer_new(bh_map);
+
+out:
+ trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
+ return ret;
}
/*
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 81ded5e2aaa2..443cc182cf18 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,6 +10,8 @@
#ifndef __BMAP_DOT_H__
#define __BMAP_DOT_H__
+#include <linux/iomap.h>
+
#include "inode.h"
struct inode;
@@ -47,6 +49,8 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
extern int gfs2_block_map(struct inode *inode, sector_t lblock,
struct buffer_head *bh, int create);
+extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap);
extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
u64 *dblock, unsigned *extlen);
extern int gfs2_setattr_size(struct inode *inode, u64 size);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 33a0cb5701a3..58705ef8643a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -60,9 +60,7 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
loff_t error;
switch (whence) {
- case SEEK_END: /* These reference inode->i_size */
- case SEEK_DATA:
- case SEEK_HOLE:
+ case SEEK_END:
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
@@ -70,8 +68,21 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
gfs2_glock_dq_uninit(&i_gh);
}
break;
+
+ case SEEK_DATA:
+ error = gfs2_seek_data(file, offset);
+ break;
+
+ case SEEK_HOLE:
+ error = gfs2_seek_hole(file, offset);
+ break;
+
case SEEK_CUR:
case SEEK_SET:
+ /*
+ * These don't reference inode->i_size and don't depend on the
+ * block mapping, so we don't need the glock.
+ */
error = generic_file_llseek(file, offset, whence);
break;
default:
@@ -108,45 +119,22 @@ static int gfs2_readdir(struct file *file, struct dir_context *ctx)
}
/**
- * fsflags_cvt
- * @table: A table of 32 u32 flags
- * @val: a 32 bit value to convert
- *
- * This function can be used to convert between fsflags values and
- * GFS2's own flags values.
+ * fsflag_gfs2flag
*
- * Returns: the converted flags
+ * The FS_JOURNAL_DATA_FL flag maps to GFS2_DIF_INHERIT_JDATA for directories,
+ * and to GFS2_DIF_JDATA for non-directories.
*/
-static u32 fsflags_cvt(const u32 *table, u32 val)
-{
- u32 res = 0;
- while(val) {
- if (val & 1)
- res |= *table;
- table++;
- val >>= 1;
- }
- return res;
-}
-
-static const u32 fsflags_to_gfs2[32] = {
- [3] = GFS2_DIF_SYNC,
- [4] = GFS2_DIF_IMMUTABLE,
- [5] = GFS2_DIF_APPENDONLY,
- [7] = GFS2_DIF_NOATIME,
- [12] = GFS2_DIF_EXHASH,
- [14] = GFS2_DIF_INHERIT_JDATA,
- [17] = GFS2_DIF_TOPDIR,
-};
-
-static const u32 gfs2_to_fsflags[32] = {
- [gfs2fl_Sync] = FS_SYNC_FL,
- [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
- [gfs2fl_AppendOnly] = FS_APPEND_FL,
- [gfs2fl_NoAtime] = FS_NOATIME_FL,
- [gfs2fl_ExHash] = FS_INDEX_FL,
- [gfs2fl_TopLevel] = FS_TOPDIR_FL,
- [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
+static struct {
+ u32 fsflag;
+ u32 gfsflag;
+} fsflag_gfs2flag[] = {
+ {FS_SYNC_FL, GFS2_DIF_SYNC},
+ {FS_IMMUTABLE_FL, GFS2_DIF_IMMUTABLE},
+ {FS_APPEND_FL, GFS2_DIF_APPENDONLY},
+ {FS_NOATIME_FL, GFS2_DIF_NOATIME},
+ {FS_INDEX_FL, GFS2_DIF_EXHASH},
+ {FS_TOPDIR_FL, GFS2_DIF_TOPDIR},
+ {FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA},
};
static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
@@ -154,17 +142,23 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
struct inode *inode = file_inode(filp);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
- int error;
- u32 fsflags;
+ int i, error;
+ u32 gfsflags, fsflags = 0;
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
error = gfs2_glock_nq(&gh);
if (error)
goto out_uninit;
- fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
- if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
- fsflags |= FS_JOURNAL_DATA_FL;
+ gfsflags = ip->i_diskflags;
+ if (S_ISDIR(inode->i_mode))
+ gfsflags &= ~GFS2_DIF_JDATA;
+ else
+ gfsflags &= ~GFS2_DIF_INHERIT_JDATA;
+ for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++)
+ if (gfsflags & fsflag_gfs2flag[i].gfsflag)
+ fsflags |= fsflag_gfs2flag[i].fsflag;
+
if (put_user(fsflags, ptr))
error = -EFAULT;
@@ -199,7 +193,6 @@ void gfs2_set_inode_flags(struct inode *inode)
GFS2_DIF_APPENDONLY| \
GFS2_DIF_NOATIME| \
GFS2_DIF_SYNC| \
- GFS2_DIF_SYSTEM| \
GFS2_DIF_TOPDIR| \
GFS2_DIF_INHERIT_JDATA)
@@ -238,10 +231,6 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
if ((new_flags ^ flags) == 0)
goto out;
- error = -EINVAL;
- if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
- goto out;
-
error = -EPERM;
if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
goto out;
@@ -256,7 +245,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
goto out;
}
if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
- if (flags & GFS2_DIF_JDATA)
+ if (new_flags & GFS2_DIF_JDATA)
gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
error = filemap_fdatawrite(inode->i_mapping);
if (error)
@@ -264,6 +253,8 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
error = filemap_fdatawait(inode->i_mapping);
if (error)
goto out;
+ if (new_flags & GFS2_DIF_JDATA)
+ gfs2_ordered_del_inode(ip);
}
error = gfs2_trans_begin(sdp, RES_DINODE, 0);
if (error)
@@ -271,6 +262,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out_trans_end;
+ inode->i_ctime = current_time(inode);
gfs2_trans_add_meta(ip->i_gl, bh);
ip->i_diskflags = new_flags;
gfs2_dinode_out(ip, bh->b_data);
@@ -289,19 +281,33 @@ out_drop_write:
static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
{
struct inode *inode = file_inode(filp);
- u32 fsflags, gfsflags;
+ u32 fsflags, gfsflags = 0;
+ u32 mask;
+ int i;
if (get_user(fsflags, ptr))
return -EFAULT;
- gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
- if (!S_ISDIR(inode->i_mode)) {
- gfsflags &= ~GFS2_DIF_TOPDIR;
- if (gfsflags & GFS2_DIF_INHERIT_JDATA)
- gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM);
+ for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) {
+ if (fsflags & fsflag_gfs2flag[i].fsflag) {
+ fsflags &= ~fsflag_gfs2flag[i].fsflag;
+ gfsflags |= fsflag_gfs2flag[i].gfsflag;
+ }
+ }
+ if (fsflags || gfsflags & ~GFS2_FLAGS_USER_SET)
+ return -EINVAL;
+
+ mask = GFS2_FLAGS_USER_SET;
+ if (S_ISDIR(inode->i_mode)) {
+ mask &= ~GFS2_DIF_JDATA;
+ } else {
+ /* The GFS2_DIF_TOPDIR flag is only valid for directories. */
+ if (gfsflags & GFS2_DIF_TOPDIR)
+ return -EINVAL;
+ mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
}
- return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA));
+
+ return do_gfs2_set_flags(filp, gfsflags, mask);
}
static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 98e845b7841b..11066d8647d2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1945,13 +1945,9 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
- int ret;
-
- if (gi->last_pos <= *pos)
- n = (*pos - gi->last_pos);
- ret = rhashtable_walk_start(&gi->hti);
- if (ret)
+ rhashtable_walk_enter(&gl_hash_table, &gi->hti);
+ if (rhashtable_walk_start(&gi->hti) != 0)
return NULL;
do {
@@ -1959,6 +1955,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
} while (gi->gl && n--);
gi->last_pos = *pos;
+
return gi->gl;
}
@@ -1970,6 +1967,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
(*pos)++;
gi->last_pos = *pos;
gfs2_glock_iter_next(gi);
+
return gi->gl;
}
@@ -1980,6 +1978,7 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
gi->gl = NULL;
rhashtable_walk_stop(&gi->hti);
+ rhashtable_walk_exit(&gi->hti);
}
static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -2042,12 +2041,10 @@ static int __gfs2_glocks_open(struct inode *inode, struct file *file,
struct gfs2_glock_iter *gi = seq->private;
gi->sdp = inode->i_private;
- gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
gi->gl = NULL;
- rhashtable_walk_enter(&gl_hash_table, &gi->hti);
}
return ret;
}
@@ -2063,7 +2060,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)
struct gfs2_glock_iter *gi = seq->private;
gi->gl = NULL;
- rhashtable_walk_exit(&gi->hti);
return seq_release_private(inode, file);
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 863749e29bf9..4e971b1c7f92 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,7 +18,7 @@
#include <linux/posix_acl.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include <linux/security.h>
#include <linux/uaccess.h>
@@ -189,7 +189,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
gfs2_set_iop(inode);
- inode->i_atime.tv_sec = 0;
+ /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
+ inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
inode->i_atime.tv_nsec = 0;
unlock_new_inode(inode);
@@ -1986,6 +1987,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
struct inode *inode = d_inode(path->dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
+ u32 gfsflags;
int error;
gfs2_holder_mark_uninitialized(&gh);
@@ -1995,13 +1997,30 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
return error;
}
+ gfsflags = ip->i_diskflags;
+ if (gfsflags & GFS2_DIF_APPENDONLY)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (gfsflags & GFS2_DIF_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_ENCRYPTED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+
generic_fillattr(inode, stat);
+
if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
return 0;
}
+const struct iomap_ops gfs2_iomap_ops = {
+ .iomap_begin = gfs2_iomap_begin,
+};
+
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -2009,41 +2028,59 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct gfs2_holder gh;
int ret;
- ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
- if (ret)
- return ret;
-
- inode_lock(inode);
+ inode_lock_shared(inode);
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
if (ret)
goto out;
- if (gfs2_is_stuffed(ip)) {
- u64 phys = ip->i_no_addr << inode->i_blkbits;
- u64 size = i_size_read(inode);
- u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
- FIEMAP_EXTENT_DATA_INLINE;
- phys += sizeof(struct gfs2_dinode);
- phys += start;
- if (start + len > size)
- len = size - start;
- if (start < size)
- ret = fiemap_fill_next_extent(fieinfo, start, phys,
- len, flags);
- if (ret == 1)
- ret = 0;
- } else {
- ret = __generic_block_fiemap(inode, fieinfo, start, len,
- gfs2_block_map);
- }
+ ret = iomap_fiemap(inode, fieinfo, start, len, &gfs2_iomap_ops);
gfs2_glock_dq_uninit(&gh);
+
out:
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return ret;
}
+loff_t gfs2_seek_data(struct file *file, loff_t offset)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ loff_t ret;
+
+ inode_lock_shared(inode);
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (!ret)
+ ret = iomap_seek_data(inode, offset, &gfs2_iomap_ops);
+ gfs2_glock_dq_uninit(&gh);
+ inode_unlock_shared(inode);
+
+ if (ret < 0)
+ return ret;
+ return vfs_setpos(file, ret, inode->i_sb->s_maxbytes);
+}
+
+loff_t gfs2_seek_hole(struct file *file, loff_t offset)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ loff_t ret;
+
+ inode_lock_shared(inode);
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (!ret)
+ ret = iomap_seek_hole(inode, offset, &gfs2_iomap_ops);
+ gfs2_glock_dq_uninit(&gh);
+ inode_unlock_shared(inode);
+
+ if (ret < 0)
+ return ret;
+ return vfs_setpos(file, ret, inode->i_sb->s_maxbytes);
+}
+
const struct inode_operations gfs2_file_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index aace8ce34a18..b5b6341a4f5c 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,8 @@ extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
extern int gfs2_open_common(struct inode *inode, struct file *file);
+extern loff_t gfs2_seek_data(struct file *file, loff_t offset);
+extern loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 8e54f2e3a304..9cb5c9a97d69 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -754,14 +754,15 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
int ret = 0;
+ bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip));
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (flush_all)
gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
if (bdi->wb.dirty_exceeded)
gfs2_ail1_flush(sdp, wbc);
else
filemap_fdatawrite(metamapping);
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (flush_all)
ret = filemap_fdatawait(metamapping);
if (ret)
mark_inode_dirty_sync(inode);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 49ac55da4e33..f67a709589d3 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM gfs2
@@ -12,6 +13,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/writeback.h>
#include <linux/ktime.h>
+#include <linux/iomap.h>
#include "incore.h"
#include "glock.h"
#include "rgrp.h"
@@ -469,6 +471,70 @@ TRACE_EVENT(gfs2_bmap,
__entry->errno)
);
+TRACE_EVENT(gfs2_iomap_start,
+
+ TP_PROTO(const struct gfs2_inode *ip, loff_t pos, ssize_t length,
+ u16 flags),
+
+ TP_ARGS(ip, pos, length, flags),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( u64, inum )
+ __field( loff_t, pos )
+ __field( ssize_t, length )
+ __field( u16, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
+ __entry->inum = ip->i_no_addr;
+ __entry->pos = pos;
+ __entry->length = length;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%u,%u bmap %llu iomap start %llu/%lu flags:%08x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->inum,
+ (unsigned long long)__entry->pos,
+ (unsigned long)__entry->length, (u16)__entry->flags)
+);
+
+TRACE_EVENT(gfs2_iomap_end,
+
+ TP_PROTO(const struct gfs2_inode *ip, struct iomap *iomap, int ret),
+
+ TP_ARGS(ip, iomap, ret),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( u64, inum )
+ __field( loff_t, offset )
+ __field( ssize_t, length )
+ __field( u16, flags )
+ __field( u16, type )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
+ __entry->inum = ip->i_no_addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ __entry->flags = iomap->flags;
+ __entry->type = iomap->type;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%u,%u bmap %llu iomap end %llu/%lu ty:%d flags:%08x rc:%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->inum,
+ (unsigned long long)__entry->offset,
+ (unsigned long)__entry->length, (u16)__entry->type,
+ (u16)__entry->flags, __entry->ret)
+);
+
/* Keep track of blocks as they are allocated/freed */
TRACE_EVENT(gfs2_block_alloc,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index affef3c066e0..a85ca8b2c9ba 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -145,7 +145,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
*
* This is used in two distinct cases:
* i) In ordered write mode
- * We put the data buffer on a list so that we can ensure that its
+ * We put the data buffer on a list so that we can ensure that it's
* synced to disk at the right time
* ii) In journaled data mode
* We need to journal the data block in the same way as metadata in
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index ea09e41dbb49..05de20954659 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -231,7 +231,6 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd;
struct gfs2_holder rg_gh;
- struct buffer_head *dibh;
__be64 *dataptrs;
u64 bn = 0;
u64 bstart = 0;
@@ -308,13 +307,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
ea->ea_num_ptrs = 0;
}
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- }
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
gfs2_trans_end(sdp);
@@ -616,7 +610,6 @@ static int gfs2_xattr_get(const struct xattr_handler *handler,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
- bool need_unlock = false;
int ret;
/* During lookup, SELinux calls this function with the glock locked. */
@@ -625,10 +618,11 @@ static int gfs2_xattr_get(const struct xattr_handler *handler,
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
if (ret)
return ret;
- need_unlock = true;
+ } else {
+ gfs2_holder_mark_uninitialized(&gh);
}
ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags);
- if (need_unlock)
+ if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
return ret;
}
@@ -749,7 +743,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
ea_skeleton_call_t skeleton_call, void *private)
{
struct gfs2_alloc_parms ap = { .target = blks };
- struct buffer_head *dibh;
int error;
error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
@@ -774,13 +767,8 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
goto out_end_trans;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- }
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
out_end_trans:
gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -891,7 +879,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
struct gfs2_ea_header *ea, struct ea_set *es)
{
struct gfs2_ea_request *er = es->es_er;
- struct buffer_head *dibh;
int error;
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
@@ -908,14 +895,9 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (es->es_el)
ea_set_remove_stuffed(ip, es->es_el);
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out;
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-out:
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+
gfs2_trans_end(GFS2_SB(&ip->i_inode));
return error;
}
@@ -1111,7 +1093,6 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
{
struct gfs2_ea_header *ea = el->el_ea;
struct gfs2_ea_header *prev = el->el_prev;
- struct buffer_head *dibh;
int error;
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
@@ -1132,13 +1113,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
ea->ea_type = GFS2_EATYPE_UNUSED;
}
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- }
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -1268,11 +1244,20 @@ static int gfs2_xattr_set(const struct xattr_handler *handler,
if (ret)
return ret;
- ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (ret)
- return ret;
+ /* May be called from gfs_setattr with the glock locked. */
+
+ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (ret)
+ return ret;
+ } else {
+ if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
+ return -EIO;
+ gfs2_holder_mark_uninitialized(&gh);
+ }
ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags);
- gfs2_glock_dq_uninit(&gh);
+ if (gfs2_holder_initialized(&gh))
+ gfs2_glock_dq_uninit(&gh);
return ret;
}
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 0933600e11c8..74fa62643136 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfs/attr.c
*
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index de69d8a24f6d..4af318fbda77 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfs/bfind.c
*
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d77d844b668b..8aec5e732abf 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfs/bnode.c
*
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 6fc766df0461..ad04a5741016 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfs/brec.c
*
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 37cdd955eceb..374b5688e29e 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfs/btree.c
*
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index f6bd266d70b5..c8b252dbb26c 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/hfs/btree.h
*
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index 683fca2e5e65..f6a56542f8d7 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
## Makefile for the linux hfsplus filesystem routines.
#
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
index 95c8ed9ec17f..488c2b75cf41 100644
--- a/fs/hfsplus/acl.h
+++ b/fs/hfsplus/acl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/hfsplus/acl.h
*
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index e5b221de7de6..2bab6b3cdba4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/attributes.c
*
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 528e38b5af7f..ca2ba8c9f82e 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/bfind.c
*
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index c0ae274c0a22..cebce0cfe340 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/bitmap.c
*
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index ce014ceb89ef..d77015c3f22c 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/bnode.c
*
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 754fdf8c6356..808f4d8c859c 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/brec.c
*
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index d9d1a36ba826..de14b2b6881b 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/btree.c
*
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index a5e00f7a4c14..a196369ba779 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/catalog.c
*
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 31d5e3f1fe17..e8120a282435 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/dir.c
*
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index a3eb640b4f8f..e8770935ce6d 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/extents.c
*
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a3f03b247463..a015044daa05 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/include/linux/hfsplus_fs.h
*
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 8298d0985f81..456e87aec7fd 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/include/linux/hfsplus_raw.h
*
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 4f26b6877130..190c60efbc99 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/inode.c
*
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 0a156d84e67d..5e6502ef7415 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/ioctl.c
*
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bb806e58c977..047e05c57560 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/options.c
*
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index 6bb5d7c42888..066114dcc3a2 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/posix_acl.c
*
diff --git a/fs/hfsplus/tables.c b/fs/hfsplus/tables.c
index 1b911730a0c1..a5fb8ee7d019 100644
--- a/fs/hfsplus/tables.c
+++ b/fs/hfsplus/tables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/tables.c
*
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index e563939882f3..dfa90c21948f 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/unicode.c
*
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 10032b919a85..08c1580bdf7a 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/wrapper.c
*
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index d37bb88dc746..e538b758c448 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/xattr.c
*
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 68f6b539371f..a4e611d69710 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/hfsplus/xattr.h
*
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 37b3efa733ef..f5550b006e88 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/xattr_trusted.c
*
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 94519d6c627d..fbad91e1dada 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/xattr_trusted.c
*
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index fae6c0ea0030..74d19faf255e 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hfsplus/xattr_user.c
*
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 91e19f9dffe5..ffaec2e7526c 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __UM_FS_HOSTFS
#define __UM_FS_HOSTFS
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index 098bf0f4f386..66617b1557c6 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/alloc.c
*
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 2d5b254ad9e2..c14c9a035ee0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/anode.c
*
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f626114449e4..e285d6b3bba4 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/buffer.c
*
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index bb87d65f0d97..89a36fdc68cb 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/dentry.c
*
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index fa6bbb4f509f..8d6b7e35faf9 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/dir.c
*
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 86ab7e790b4e..3b834563b1f1 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/dnode.c
*
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index ce3f98ba993a..102ba18e561f 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/ea.c
*
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index f26138425b16..1ecec124e76f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/file.c
*
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index cce025aff1b1..823a328791c0 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/hpfs/hpfs.h
*
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index d352f3a6af7f..2577ef1034ef 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/hpfs/hpfs_fn.h
*
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index b9c724ed1e7e..eb8b4baf0f2e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/inode.c
*
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index a136929189f0..e0e60b148400 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/map.c
*
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index b00d396d22c6..ef7ba77f36b8 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/name.c
*
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f30c14414518..a3615e4c730d 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/hpfs/namei.c
*
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 59073e9f01a4..ed113ea17aff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -842,9 +842,12 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
struct inode *inode = mapping->host;
+ pgoff_t index = page->index;
remove_huge_page(page);
- hugetlb_fix_reserve_counts(inode);
+ if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
+
return 0;
}
diff --git a/fs/inode.c b/fs/inode.c
index d1e35b53bb23..fd401028a309 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2090,7 +2090,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags,
WARN_ON_ONCE(flags & ~mask);
do {
- old_flags = ACCESS_ONCE(inode->i_flags);
+ old_flags = READ_ONCE(inode->i_flags);
new_flags = (old_flags & ~mask) | flags;
} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
new_flags) != old_flags));
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 569db68d02b3..5ace7efb0d04 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ioctl.c
*
diff --git a/fs/iomap.c b/fs/iomap.c
index 269b24a01f32..5011a964a550 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
{
- sector_t sector = iomap->blkno +
- (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+ sector_t sector = (iomap->addr +
+ (pos & PAGE_MASK) - iomap->offset) >> 9;
return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
offset, bytes);
@@ -510,11 +510,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_MERGED;
if (iomap->flags & IOMAP_F_SHARED)
flags |= FIEMAP_EXTENT_SHARED;
+ if (iomap->flags & IOMAP_F_DATA_INLINE)
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
return fiemap_fill_next_extent(fi, iomap->offset,
- iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+ iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
iomap->length, flags);
-
}
static loff_t
@@ -713,6 +714,8 @@ struct iomap_dio {
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ loff_t offset = iocb->ki_pos;
ssize_t ret;
if (dio->end_io) {
@@ -726,12 +729,33 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (likely(!ret)) {
ret = dio->size;
/* check for short read */
- if (iocb->ki_pos + ret > dio->i_size &&
+ if (offset + ret > dio->i_size &&
!(dio->flags & IOMAP_DIO_WRITE))
- ret = dio->i_size - iocb->ki_pos;
+ ret = dio->i_size - offset;
iocb->ki_pos += ret;
}
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ *
+ * And this page cache invalidation has to be after dio->end_io(), as
+ * some filesystems convert unwritten extents to real allocations in
+ * end_io() when necessary, otherwise a racing buffer read would cache
+ * zeros from unwritten extents.
+ */
+ if (!dio->error &&
+ (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+ int err;
+ err = invalidate_inode_pages2_range(inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ (offset + dio->size - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(err);
+ }
+
inode_dio_end(file_inode(iocb->ki_filp));
kfree(dio);
@@ -807,7 +831,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iomap->bdev);
bio->bi_iter.bi_sector =
- iomap->blkno + ((pos - iomap->offset) >> 9);
+ (iomap->addr + pos - iomap->offset) >> 9;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
@@ -886,7 +910,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio_set_dev(bio, iomap->bdev);
bio->bi_iter.bi_sector =
- iomap->blkno + ((pos - iomap->offset) >> 9);
+ (iomap->addr + pos - iomap->offset) >> 9;
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
@@ -993,6 +1017,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
WARN_ON_ONCE(ret);
ret = 0;
+ if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+ !inode->i_sb->s_dio_done_wq) {
+ ret = sb_init_dio_done_wq(inode->i_sb);
+ if (ret < 0)
+ goto out_free_dio;
+ }
+
inode_dio_begin(inode);
blk_start_plug(&plug);
@@ -1015,13 +1046,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret < 0)
iomap_dio_set_error(dio, ret);
- if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
- !inode->i_sb->s_dio_done_wq) {
- ret = sb_init_dio_done_wq(inode->i_sb);
- if (ret < 0)
- iomap_dio_set_error(dio, ret);
- }
-
if (!atomic_dec_and_test(&dio->ref)) {
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
@@ -1042,19 +1066,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
ret = iomap_dio_complete(dio);
- /*
- * Try again to invalidate clean pages which might have been cached by
- * non-direct readahead, or faulted in by get_user_pages() if the source
- * of the write was an mmap'ed region of the file we're writing. Either
- * one is a pretty crazy thing to do, so we don't support it 100%. If
- * this invalidation fails, tough, the write still worked...
- */
- if (iov_iter_rw(iter) == WRITE) {
- int err = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(err);
- }
-
return ret;
out_free_dio:
diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile
index bf162f0942d5..6498fd2b0f60 100644
--- a/fs/isofs/Makefile
+++ b/fs/isofs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux isofs filesystem routines.
#
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e7599615e4e0..947ce22f5b3c 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/isofs/dir.c
*
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 0c5f721b4e91..85a9093769a9 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/isofs/export.c
*
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index db692f554158..447a24d77b89 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -514,9 +514,11 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
if (sbi->s_fmode != ISOFS_INVALID_MODE)
seq_printf(m, ",fmode=%o", sbi->s_fmode);
+#ifdef CONFIG_JOLIET
if (sbi->s_nls_iocharset &&
strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0)
seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset);
+#endif
return 0;
}
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 133a456b0425..57d4c3e2e94a 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index a048de81c093..be8b6a9d0b92 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/isofs/joliet.c
*
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index aee592767f1d..cac468f04820 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/isofs/namei.c
*
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 0ec137310320..94ef92fe806c 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/isofs/rock.c
*
diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h
index ed09e2b08637..ef03625431bb 100644
--- a/fs/isofs/rock.h
+++ b/fs/isofs/rock.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* These structs are used by the system-use-sharing protocol, in which the
* Rock Ridge extensions are embedded. It is quite possible that other
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 005a15cfd30a..42544bf0e222 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/isofs/util.c
*/
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 7d5ef3bf3f3e..d2a85c9720e9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -165,11 +165,11 @@ static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
* Helper function used to manage commit timeouts
*/
-static void commit_timeout(unsigned long __data)
+static void commit_timeout(struct timer_list *t)
{
- struct task_struct * p = (struct task_struct *) __data;
+ journal_t *journal = from_timer(journal, t, j_commit_timer);
- wake_up_process(p);
+ wake_up_process(journal->j_task);
}
/*
@@ -197,8 +197,7 @@ static int kjournald2(void *arg)
* Set up an interval timer which can be used to trigger a commit wakeup
* after the commit interval expires
*/
- setup_timer(&journal->j_commit_timer, commit_timeout,
- (unsigned long)current);
+ timer_setup(&journal->j_commit_timer, commit_timeout, 0);
set_freezable();
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 60e5d49ca03e..5294969d5bf9 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux Journalling Flash File System v2 (JFFS2)
#
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index d20d4737b3ef..285ec189ed5c 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux JFS filesystem routines.
#
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 5c5ac5b3aec3..ba34dae8bd9f 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/jfs/ioctl.c
*
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 1c4b9ad4d7ab..1a3b0cc22ad3 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -663,6 +663,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
} else {
INCREMENT(mpStat.pagealloc);
mp = alloc_metapage(GFP_NOFS);
+ if (!mp)
+ goto unlock;
mp->page = page;
mp->sb = inode->i_sb;
mp->flag = 0;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2f14677169c3..2f7b3af5b8b7 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -853,7 +853,6 @@ out:
}
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_version++;
inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
inode_unlock(inode);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 9b320cc2a8cf..6d5e83ed4476 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux lock manager stuff
#
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index c349fc0f9b80..00d5ef5f99f7 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/clnt4xdr.c
*
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 3b4724a6c4ee..2c6176387143 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/clntxdr.c
*
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index d716c9993a26..0d4e590e0549 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/host.c
*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 9d8166c39c54..9fbbd11f9ecb 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/mon.c
*
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index fb8cac88251a..5bec78c8e431 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LOCKD_NETNS_H__
#define __LOCKD_NETNS_H__
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
index 8f72cb237ef3..ca9228a56d65 100644
--- a/fs/lockd/procfs.c
+++ b/fs/lockd/procfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Procfs support for lockd
*
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
index 184a15edd18d..ba9a82f4ce28 100644
--- a/fs/lockd/procfs.h
+++ b/fs/lockd/procfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Procfs support for lockd
*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 82925f17ec45..1bddf70d9656 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/svc4proc.c
*
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 3507c80d1d4b..3701bccab478 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/svclock.c
*
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 07915162581d..0d670c5c378f 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/svcproc.c
*
diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index b0ae07008700..ade4931b2da2 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/svcshare.c
*
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 442bbd0b0b29..7147e4aebecc 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/xdr.c
*
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 2a0cd5679c49..7ed9edf9aed4 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/lockd/xdr4.c
*
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index c2c3fd3277b5..f4e5e5181a14 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/minix/bitmap.c
*
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index baa9721f1299..dcfe5b25378b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/minix/dir.c
*
diff --git a/fs/minix/file.c b/fs/minix/file.c
index a6a4797aa0d4..c50b0a20fcd9 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/minix/file.c
*
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 2d1ca08870f7..043c3fdbc8e7 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Generic part */
typedef struct {
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 46ca39d6c735..046cc96ee7ad 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include "minix.h"
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index 1ee101352586..f7fc7ecccccc 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/buffer_head.h>
#include "minix.h"
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 663d66138d06..df081e8afcc3 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef FS_MINIX_H
#define FS_MINIX_H
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 1e0f11f5dac9..ccf0f00030bf 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/minix/namei.c
*
diff --git a/fs/mount.h b/fs/mount.h
index 6790767d1883..f39bc9da4d73 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
diff --git a/fs/mpage.c b/fs/mpage.c
index 37bb77c1302c..b7e7f570733a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/mpage.c
*
@@ -468,6 +469,16 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
try_to_free_buffers(page);
}
+/*
+ * For situations where we want to clean all buffers attached to a page.
+ * We don't need to calculate how many buffers are attached to the page,
+ * we just need to specify a number larger than the maximum number of buffers.
+ */
+void clean_page_buffers(struct page *page)
+{
+ clean_buffers(page, ~0U);
+}
+
static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
@@ -605,10 +616,8 @@ alloc_new:
if (bio == NULL) {
if (first_unmapped == blocks_per_page) {
if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
- page, wbc)) {
- clean_buffers(page, first_unmapped);
+ page, wbc))
goto out;
- }
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
diff --git a/fs/namei.c b/fs/namei.c
index c75ea03ca147..5424b10cfdc4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/namei.c
*
@@ -1209,7 +1210,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
/* Given that we're not holding a lock here, we retain the value in a
* local variable for each dentry as we look at it so that we don't see
* the components of that value change under us */
- while (managed = ACCESS_ONCE(path->dentry->d_flags),
+ while (managed = READ_ONCE(path->dentry->d_flags),
managed &= DCACHE_MANAGED_DENTRY,
unlikely(managed != 0)) {
/* Allow the filesystem to manage the transit without i_mutex
@@ -1394,7 +1395,7 @@ int follow_down(struct path *path)
unsigned managed;
int ret;
- while (managed = ACCESS_ONCE(path->dentry->d_flags),
+ while (managed = READ_ONCE(path->dentry->d_flags),
unlikely(managed & DCACHE_MANAGED_DENTRY)) {
/* Allow the filesystem to manage the transit without i_mutex
* being held.
diff --git a/fs/namespace.c b/fs/namespace.c
index 54059b142d6b..e158ec6b527b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -353,7 +353,7 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
cpu_relax();
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
@@ -468,7 +468,9 @@ static inline int may_write_real(struct file *file)
/* File refers to upper, writable layer? */
upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
- if (upperdentry && file_inode(file) == d_inode(upperdentry))
+ if (upperdentry &&
+ (file_inode(file) == d_inode(upperdentry) ||
+ file_inode(file) == d_inode(dentry)))
return 0;
/* Lower layer: can't write to real file, sorry... */
@@ -2823,7 +2825,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
SB_MANDLOCK |
SB_DIRSYNC |
SB_SILENT |
- SB_POSIXACL);
+ SB_POSIXACL |
+ SB_I_VERSION);
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags, sb_flags, mnt_flags,
diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile
index c66af563f2ce..66fe5f878817 100644
--- a/fs/ncpfs/Makefile
+++ b/fs/ncpfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux ncp filesystem routines.
#
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 088f52484d6e..0c57c5c5d40a 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* dir.c
*
@@ -119,10 +120,6 @@ static inline int ncp_case_sensitive(const struct inode *i)
/*
* Note: leave the hash unchanged if the directory
* is case-sensitive.
- *
- * Accessing the parent inode can be racy under RCU pathwalking.
- * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
- * the callers will handle races.
*/
static int
ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
@@ -147,11 +144,6 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
return 0;
}
-/*
- * Accessing the parent inode can be racy under RCU pathwalking.
- * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
- * the callers will handle races.
- */
static int
ncp_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index a06c07619ee6..8f8cc0334ddd 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* file.c
*
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 344889cd120e..5c941bef14c4 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* getopt.c
*/
diff --git a/fs/ncpfs/getopt.h b/fs/ncpfs/getopt.h
index cccc007dcaf9..30f0da317670 100644
--- a/fs/ncpfs/getopt.h
+++ b/fs/ncpfs/getopt.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_GETOPT_H
#define _LINUX_GETOPT_H
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 6d0f14c86099..129f1937fa2c 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -618,7 +618,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
server->tx.creq = NULL;
server->rcv.creq = NULL;
- init_timer(&server->timeout_tm);
+ timer_setup(&server->timeout_tm, ncpdgram_timeout_call, 0);
#undef NCP_PACKET_SIZE
#define NCP_PACKET_SIZE 131072
error = -ENOMEM;
@@ -650,8 +650,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
} else {
INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc);
INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc);
- server->timeout_tm.data = (unsigned long)server;
- server->timeout_tm.function = ncpdgram_timeout_call;
}
release_sock(sock->sk);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 12550c2320cc..d378b98cd7b6 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ioctl.c
*
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 6719c0be674d..a5c5cf2ff007 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* mmap.c
*
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
index b9f69e1b1f43..bdd262b6c198 100644
--- a/fs/ncpfs/ncp_fs.h
+++ b/fs/ncpfs/ncp_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/ncp_fs.h>
#include "ncp_fs_i.h"
#include "ncp_fs_sb.h"
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
index c4794504f843..3432bafb53a5 100644
--- a/fs/ncpfs/ncp_fs_i.h
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ncp_fs_i.h
*
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 366fd63cc506..f06cde4adf71 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ncp_fs_sb.h
*
@@ -149,7 +150,7 @@ extern void ncp_tcp_rcv_proc(struct work_struct *work);
extern void ncp_tcp_tx_proc(struct work_struct *work);
extern void ncpdgram_rcv_proc(struct work_struct *work);
extern void ncpdgram_timeout_proc(struct work_struct *work);
-extern void ncpdgram_timeout_call(unsigned long server);
+extern void ncpdgram_timeout_call(struct timer_list *t);
extern void ncp_tcp_data_ready(struct sock* sk);
extern void ncp_tcp_write_space(struct sock* sk);
extern void ncp_tcp_error_report(struct sock* sk);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 88dbbc9fcf4d..804adfebba2f 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ncplib_kernel.c
*
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index b4c87cfcee95..aaae8aa9bf7d 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ncplib_kernel.h
*
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 08907599dcd2..8085b1a3ba47 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ncpsign_kernel.c
*
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index d9a1438bb1f6..57ff0a0650b8 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ncpsign_kernel.h
*
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 98b6db0ed63e..efb176b1751a 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ncpfs/sock.c
*
@@ -116,10 +117,10 @@ void ncp_tcp_write_space(struct sock *sk)
schedule_work(&server->tx.tq);
}
-void ncpdgram_timeout_call(unsigned long v)
+void ncpdgram_timeout_call(struct timer_list *t)
{
- struct ncp_server *server = (void*)v;
-
+ struct ncp_server *server = from_timer(server, t, timeout_tm);
+
schedule_work(&server->timeout_tq);
}
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index a6d26b46fc05..b6e16da4837a 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ncpfs/symlink.c
*
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 1fb118902d57..c587e3c4c6a6 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux nfs filesystem routines.
#
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a69ef4e9c24c..95f74bd2c067 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c85fbfd2d0d9..7a57ff2528af 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 2ae676f93e6b..b60627bcfc62 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/cache_lib.c
*
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 4116d2c3f52f..4e6236a86cf7 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Helper routines for the NFS client caches
*
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 2cddf7f437e6..cd9d992feb2e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/callback.c
*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 3dc54d7cb19c..a20a0bce40a4 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/nfs/callback.h
*
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 14358de173fb..19151f6c0e97 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/callback_proc.c
*
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 681dd642f119..123c069429a7 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/callback_xdr.c
*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index efebe6cf4378..22880ef6d8dd 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -218,7 +218,6 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
static void pnfs_init_server(struct nfs_server *server)
{
rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
- rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
}
#else
@@ -888,6 +887,7 @@ struct nfs_server *nfs_alloc_server(void)
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
+ rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
return server;
}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e9d555796873..ddaf2644cf13 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/nfs/delegation.h
*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5ceaeb1f6fb6..f439f1c45008 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1081,7 +1081,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
int error;
if (flags & LOOKUP_RCU) {
- parent = ACCESS_ONCE(dentry->d_parent);
+ parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
@@ -1168,7 +1168,7 @@ out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
if (flags & LOOKUP_RCU) {
- if (parent != ACCESS_ONCE(dentry->d_parent))
+ if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else
dput(parent);
@@ -1582,7 +1582,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct inode *dir;
if (flags & LOOKUP_RCU) {
- parent = ACCESS_ONCE(dentry->d_parent);
+ parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
@@ -1596,7 +1596,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
ret = -ECHILD;
if (!(flags & LOOKUP_RCU))
dput(parent);
- else if (parent != ACCESS_ONCE(dentry->d_parent))
+ else if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
goto out;
}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index d25f10fb4926..060c658eab66 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/dns_resolve.c
*
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index 2e4f596d2923..576ff4b54c82 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Resolve DNS hostnames into valid ip addresses
*/
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 249cb96cc5b5..83fd09fc8f77 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2015, Primary Data, Inc. All rights reserved.
*
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 44c638b7876c..508126eb49f9 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -745,7 +745,8 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
- nfs4_fl_put_deviceid(fl->dsaddr);
+ if (fl->dsaddr != NULL)
+ nfs4_fl_put_deviceid(fl->dsaddr);
/* This assumes a single RW lseg */
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_filelayout *flo;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 98b34c9b0564..679cb087ef3f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* NFSv4 flexfile layout driver data structures.
*
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f32c58bbe556..d62279d3fc5d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Device operations for the pnfs nfs4 file layout driver.
*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5bdf952f414b..f9a4a5524bd5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* NFS internal definitions
*/
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 1fc5d1ce327e..20fef85d2bb1 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Trond Myklebust
*
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 0cb806fbd4c4..2ddaab1ac653 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/nfs/iostat.h
*
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 60bad882c123..d979ff4fee7e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* In-kernel MOUNT protocol client
*
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 5fbd2bde91ba..fc9978c58265 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* NFS-private data for each "struct net". Accessed with net_generic().
*/
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
index 43679df56cd0..5ba00610aede 100644
--- a/fs/nfs/nfs.h
+++ b/fs/nfs/nfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2012 Netapp, Inc. All rights reserved.
*
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index fe68dabfbde6..85e4b4a233f9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/nfs2xdr.c
*
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index e134d6548ab7..f82e11c4cb56 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2014 Anna Schumaker.
*
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 720d92f5abfb..7173a4ee862c 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/nfs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d1e87ec0df84..bc673fb47fb3 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/nfs3proc.c
*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e82c9e553224..6cd33bd5da87 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/nfs3xdr.c
*
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b6cd15314bab..19ec38f85ce0 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
*/
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6c2db51e67a7..9c374441f660 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
*/
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 5ee1b0f0d904..5966e1e7b1f5 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
*/
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ac4f10b7f6c1..dcfcf7fd7438 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/nfs/nfs4_fs.h
*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 0efba77789b9..626d1382002e 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/file.c
*
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index ac8406018962..1a69479a3a59 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index dd5d27da8c0c..30426c1a1bbd 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -274,7 +274,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
ssize_t ret;
ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
- if (ret <= 0)
+ if (ret < 0)
return ERR_PTR(ret);
rkey = request_key(&key_type_id_resolver, desc, "");
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 7d531da1bae3..8c3f327d858d 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/nfs4namespace.c
*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6c61e2b99635..f90090e8c959 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8399,8 +8399,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
lo = NFS_I(inode)->layout;
/* If the open stateid was bad, then recover it. */
if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
- nfs4_stateid_match_other(&lgp->args.stateid,
- &lgp->args.ctx->state->stateid)) {
+ !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
spin_unlock(&inode->i_lock);
exception->state = lgp->args.ctx->state;
exception->stateid = &lgp->args.stateid;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index dfae4880eacb..3c550f297561 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/nfs/nfs4session.h
*
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 8693d77c45ea..0d91d84e5822 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/nfs4sysctl.c
*
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index 2850bce19244..e9fb3e50a999 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
*/
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index be1da19c65d6..e7c6275519b0 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
*/
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 37c8af003275..14ed9791ec9c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1842,8 +1842,8 @@ static void encode_create_session(struct xdr_stream *xdr,
* Assumes OPEN is the biggest non-idempotent compound.
* 2 is the verifier.
*/
- max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
- RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
+ max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2)
+ * XDR_UNIT + RPC_MAX_AUTH_SIZE;
encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 89a15dbe5efc..effaa4247b91 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
*
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index c74f7af23d77..b60d5fbd7727 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
*/
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 551711042ba4..093290c42d7c 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
*/
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7962e49097c3..f7fd9192d4bc 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/proc.c
*
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 5a1d0ded8979..06eb44b47885 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/symlink.c
*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index bb6ed810fa6f..7aea195ddb35 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/sysctl.c
*
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e3949d93085c..630b4a3c1a93 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/nfs/unlink.c
*
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 5f5d3a76980c..2bfb58eefad1 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux nfs server
#
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 62469c60be23..697f8ae7792d 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
#include <linux/sched.h>
diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
index 53325a12ba62..dbd66424f600 100644
--- a/fs/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* nfsd-specific authentication stuff.
*
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c862c2489df0..70b8bf781fce 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
@@ -65,7 +66,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
bex->es = PNFS_BLOCK_READ_DATA;
else
bex->es = PNFS_BLOCK_READWRITE_DATA;
- bex->soff = (iomap.blkno << 9);
+ bex->soff = iomap.addr;
break;
case IOMAP_UNWRITTEN:
if (seg->iomode & IOMODE_RW) {
@@ -78,7 +79,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
}
bex->es = PNFS_BLOCK_INVALID_DATA;
- bex->soff = (iomap.blkno << 9);
+ bex->soff = iomap.addr;
break;
}
/*FALLTHRU*/
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index ac6f54546fdd..442543304930 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 397bc7563a49..bc5166bfe46b 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NFSD_BLOCKLAYOUTXDR_H
#define _NFSD_BLOCKLAYOUTXDR_H 1
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index dd96a3830004..046b3f048757 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Request reply cache. This was heavily inspired by the
* implementation in 4.3BSD/4.4BSD.
diff --git a/fs/nfsd/current_stateid.h b/fs/nfsd/current_stateid.h
index 34075cee573a..c28540d86742 100644
--- a/fs/nfsd/current_stateid.h
+++ b/fs/nfsd/current_stateid.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NFSD4_CURRENT_STATE_H
#define _NFSD4_CURRENT_STATE_H
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 3bc08c394a3f..46b48dbbdd32 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NFS exporting and validation.
*
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 730f15eeb7ed..c8b74126ddaa 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
*/
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 34c1c449fddf..6dfede6d172a 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
*
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index b67287383010..db7ef07ae50c 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
*
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index 5e3fd7fc1a9f..e81d2a5cf381 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
*/
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
index 467defd4e563..8e195aeca023 100644
--- a/fs/nfsd/flexfilelayoutxdr.h
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
*/
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 1a03bc3059e8..3f5b3d7b62b7 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file contains all the stubs needed when communicating with lockd.
* This level of indirection is necessary so we can run nfsd+lockd without
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 6276ec8608b0..cbab1d2d8a75 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Process version 2 NFSACL requests.
*
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 01976529f042..13bca4a2f89d 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Process version 3 NFSACL requests.
*
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2cb56a0d6625..1d0ce3c57d93 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Process version 3 NFS requests.
*
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index bf444b664011..f38acd905441 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* XDR support for nfsd/protocol version 3.
*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e122da696f1b..ea45d954e8d7 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 3c69db7d4905..8487486ec496 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -927,6 +927,13 @@ nfsd4_secinfo_release(union nfsd4_op_u *u)
exp_put(u->secinfo.si_exp);
}
+static void
+nfsd4_secinfo_no_name_release(union nfsd4_op_u *u)
+{
+ if (u->secinfo_no_name.sin_exp)
+ exp_put(u->secinfo_no_name.sin_exp);
+}
+
static __be32
nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -2375,7 +2382,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_SECINFO_NO_NAME] = {
.op_func = nfsd4_secinfo_no_name,
- .op_release = nfsd4_secinfo_release,
+ .op_release = nfsd4_secinfo_no_name_release,
.op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO_NO_NAME",
.op_rsize_bop = nfsd4_secinfo_rsize,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 96fd15979cbd..334f2ad60704 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Request reply cache. This is currently a global cache, but this may
* change in the future and be a per-client cache.
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b9c538ab7a59..3fce905d0365 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Hodge-podge collection of knfsd-related stuff.
* I will sort this out later.
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index cfe7500d5847..8aa011820c4a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NFS server file handle treatment.
*
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index e47cf6c2ac28..43f31cf49bae 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5076ae2b8258..43c0419b8ddb 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Process version 2 NFS requests.
*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7e3af3ef0917..e02bd2783124 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Central processing for nfsd.
*
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index e4da2717982d..644a0342f0e0 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* XDR support for nfsd
*
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d27a5aa60022..4f4282d4eeca 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_NFSD_PNFS_H
#define _FS_NFSD_PNFS_H 1
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index d97338bb6a39..9bce3b913189 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* procfs-based user access to knfsd statistics
*
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index a5c944b771c6..b23fdac69820 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Statistics for NFS server.
*
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 3287041905da..8b2f1d92c579 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index bc69d40c4e8b..a3c9bfa77def 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* File operations used by nfsd. Some of these have been ripped from
* other parts of the kernel because they weren't exported, others
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 1bbdccecbf3d..be6d8e00453f 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
*/
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 457ce45e5084..2f4f22e6b8cb 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* XDR types for nfsd. This is mainly a typing exercise. */
#ifndef LINUX_NFSD_H
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 80d7da620e91..056bf8a7364e 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* XDR types for NFSv3 in nfsd.
*
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 49b719dfef95..517239af0302 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#define NFS4_MAXTAGLEN 20
#define NFS4_enc_cb_null_sz 0
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index fc603e0431bb..43b60b8a4d07 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_NILFS2_FS) += nilfs2.o
nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
btnode.o bmap.o btree.o direct.o dat.o recovery.o \
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
index 00107fdb9343..d29fd837c42c 100644
--- a/fs/nilfs2/export.h
+++ b/fs/nilfs2/export.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef NILFS_EXPORT_H
#define NILFS_EXPORT_H
diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index 8ae37c1b5249..ac54db297128 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for native language support
#
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 3e969ae91b60..63a4b8828df4 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \
fdinfo.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 2fa99aeaa095..09640b546363 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4eb6f5efa282..7dacb7d80727 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fsnotify_backend.h>
#include <linux/path.h>
#include <linux/slab.h>
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 907a481ac781..9752e7270e61 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
#include <linux/file.h>
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index dd63aa9a6f9a..517f88c1dbe5 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify_backend.h>
diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h
index 9664c4904d6b..5c9937e02e21 100644
--- a/fs/notify/fdinfo.h
+++ b/fs/notify/fdinfo.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FSNOTIFY_FDINFO_H__
#define __FSNOTIFY_FDINFO_H__
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index bf012e8ecd14..60f365dc1408 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_NOTIFY_FSNOTIFY_H_
#define __FS_NOTIFY_FSNOTIFY_H_
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 9ff67b61da8a..c00d2caca894 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 08127a2b8559..ef243e14b6eb 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/fs.h>
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 2ff263e6d363..3e736572ed00 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
# Rules for making the NTFS driver.
obj-$(CONFIG_NTFS_FS) += ntfs.o
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4342c7ee7d20..99ee093182cb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS) += \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a177eae3aa1a..addd7c5f2d3e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7304,13 +7304,24 @@ out:
static int ocfs2_trim_extent(struct super_block *sb,
struct ocfs2_group_desc *gd,
- u32 start, u32 count)
+ u64 group, u32 start, u32 count)
{
u64 discard, bcount;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
bcount = ocfs2_clusters_to_blocks(sb, count);
- discard = le64_to_cpu(gd->bg_blkno) +
- ocfs2_clusters_to_blocks(sb, start);
+ discard = ocfs2_clusters_to_blocks(sb, start);
+
+ /*
+ * For the first cluster group, the gd->bg_blkno is not at the start
+ * of the group, but at an offset from the start. If we add it while
+ * calculating discard for first group, we will wrongly start fstrim a
+ * few blocks after the desried start block and the range can cross
+ * over into the next cluster group. So, add it only if this is not
+ * the first cluster group.
+ */
+ if (group != osb->first_cluster_group_blkno)
+ discard += le64_to_cpu(gd->bg_blkno);
trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
@@ -7318,7 +7329,7 @@ static int ocfs2_trim_extent(struct super_block *sb,
}
static int ocfs2_trim_group(struct super_block *sb,
- struct ocfs2_group_desc *gd,
+ struct ocfs2_group_desc *gd, u64 group,
u32 start, u32 max, u32 minbits)
{
int ret = 0, count = 0, next;
@@ -7337,7 +7348,7 @@ static int ocfs2_trim_group(struct super_block *sb,
next = ocfs2_find_next_bit(bitmap, max, start);
if ((next - start) >= minbits) {
- ret = ocfs2_trim_extent(sb, gd,
+ ret = ocfs2_trim_extent(sb, gd, group,
start, next - start);
if (ret < 0) {
mlog_errno(ret);
@@ -7435,7 +7446,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
gd = (struct ocfs2_group_desc *)gd_bh->b_data;
- cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ cnt = ocfs2_trim_group(sb, gd, group,
+ first_bit, last_bit, minlen);
brelse(gd_bh);
gd_bh = NULL;
if (cnt < 0) {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 4506ec5ec2ea..ab30c005cc4b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ocfs2/ioctl.c
*
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 0cd5323bd3f0..9f5e4d95e37f 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ioctl.h
*
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
index 1274ee0f1fe2..1051507cc684 100644
--- a/fs/ocfs2/mmap.h
+++ b/fs/ocfs2/mmap.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 0b58abcf1c6d..a0b5d00ef0a9 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ocfs2
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d153e6e31529..ebb5c99f490e 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* quota.h for OCFS2
*
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c94b6baaa551..b39d14cbfa34 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of operations over global quota file
*/
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index aa700fd10610..16c42ed0dca8 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of operations over local quota file
*/
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index 83f4e76511c2..7147ba6a6afc 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index f0f8bc75e609..4008be73de54 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _OMFS_H
#define _OMFS_H
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index 83a98330ed66..caecb3d5a344 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _OMFS_FS_H
#define _OMFS_FS_H
diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile
index a9d6a968fe6d..9b6c50bb173b 100644
--- a/fs/orangefs/Makefile
+++ b/fs/orangefs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the ORANGEFS filesystem.
#
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 9108ef433e6d..c2d8233b1e82 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 5355efba4bc8..ae782df5c063 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index 2826859bdc2c..ded456f17de6 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index d327cbd17756..a8cc588d6224 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2017 Omnibond Systems, L.L.C.
*/
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 163001c95501..ea2332e16af9 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 336ecbf8c268..e4a8e6a7eb17 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 9428ea0aac16..28825a5b6d09 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 478e88bd7f9d..7e9e5d0ea3bc 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 7ef473f3d642..59f444dced9b 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
index 71f64f4057b5..c2c3c5a0eeab 100644
--- a/fs/orangefs/orangefs-bufmap.h
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index aa3830b741c7..3b6982bf6bcf 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
index 387db17cde2b..b6001bb28f5a 100644
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 5f59917fd631..1c59dff530de 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* What: /sys/kernel/debug/orangefs/debug-help
* Date: June 2015
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
index 803517269ba6..b5fd9cd4960f 100644
--- a/fs/orangefs/orangefs-debugfs.h
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
int orangefs_debugfs_init(int);
void orangefs_debugfs_cleanup(void);
int orangefs_client_debug_init(void);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index efe08c763e56..dc6609824965 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index ea0ce507a6ab..004af348fb80 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index afd2f523b283..079a465796f3 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Documentation/ABI/stable/orangefs-sysfs:
*
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index aab6f1842963..f82336496311 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 48bcc1bbe415..e0bf5e4dce0d 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/spinlock_types.h>
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 47f3fb9cbec4..47ebd9bfd1a1 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
index 02b1bbdbcc42..d856cdf91763 100644
--- a/fs/orangefs/symlink.c
+++ b/fs/orangefs/symlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
index b8249f8fdd80..16118452aa12 100644
--- a/fs/orangefs/upcall.h
+++ b/fs/orangefs/upcall.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 61e2ca7fec55..835c6e148afc 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
* (C) 2011 Omnibond Systems
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 81ac88bb91ff..03bcb871544d 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
*
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index aad97b30d5e6..c441f9387a1b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -561,10 +561,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
c->tmpfile = true;
err = ovl_copy_up_locked(c);
} else {
- err = -EIO;
- if (lock_rename(c->workdir, c->destdir) != NULL) {
- pr_err("overlayfs: failed to lock workdir+upperdir\n");
- } else {
+ err = ovl_lock_rename_workdir(c->workdir, c->destdir);
+ if (!err) {
err = ovl_copy_up_locked(c);
unlock_rename(c->workdir, c->destdir);
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 3309b1912241..cc961a3bd3bd 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -216,26 +216,6 @@ out_unlock:
return err;
}
-static int ovl_lock_rename_workdir(struct dentry *workdir,
- struct dentry *upperdir)
-{
- /* Workdir should not be the same as upperdir */
- if (workdir == upperdir)
- goto err;
-
- /* Workdir should not be subdir of upperdir and vice versa */
- if (lock_rename(workdir, upperdir) != NULL)
- goto err_unlock;
-
- return 0;
-
-err_unlock:
- unlock_rename(workdir, upperdir);
-err:
- pr_err("overlayfs: failed to lock workdir+upperdir\n");
- return -EIO;
-}
-
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a619addecafc..321511ed8c42 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -598,18 +598,30 @@ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
return true;
}
-struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry)
+struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry,
+ struct dentry *index)
{
struct dentry *lowerdentry = ovl_dentry_lower(dentry);
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
+ /* Already indexed or could be indexed on copy up? */
+ bool indexed = (index || (ovl_indexdir(dentry->d_sb) && !upperdentry));
+
+ if (WARN_ON(upperdentry && indexed && !lowerdentry))
+ return ERR_PTR(-EIO);
if (!realinode)
realinode = d_inode(lowerdentry);
- if (!S_ISDIR(realinode->i_mode) &&
- (upperdentry || (lowerdentry && ovl_indexdir(dentry->d_sb)))) {
- struct inode *key = d_inode(lowerdentry ?: upperdentry);
+ /*
+ * Copy up origin (lower) may exist for non-indexed upper, but we must
+ * not use lower as hash key in that case.
+ * Hash inodes that are or could be indexed by origin inode and
+ * non-indexed upper inodes that could be hard linked by upper inode.
+ */
+ if (!S_ISDIR(realinode->i_mode) && (upperdentry || indexed)) {
+ struct inode *key = d_inode(indexed ? lowerdentry :
+ upperdentry);
unsigned int nlink;
inode = iget5_locked(dentry->d_sb, (unsigned long) key,
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index c3addd1114f1..a12dc10bf726 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -405,14 +405,13 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack,
* be treated as stale (i.e. after unlink of the overlay inode).
* We don't know the verification rules for directory and whiteout
* index entries, because they have not been implemented yet, so return
- * EROFS if those entries are found to avoid corrupting an index that
- * was created by a newer kernel.
+ * EINVAL if those entries are found to abort the mount to avoid
+ * corrupting an index that was created by a newer kernel.
*/
- err = -EROFS;
+ err = -EINVAL;
if (d_is_dir(index) || ovl_is_whiteout(index))
goto fail;
- err = -EINVAL;
if (index->d_name.len < sizeof(struct ovl_fh)*2)
goto fail;
@@ -506,6 +505,11 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry,
index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
if (IS_ERR(index)) {
+ err = PTR_ERR(index);
+ if (err == -ENOENT) {
+ index = NULL;
+ goto out;
+ }
pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n"
"overlayfs: mount with '-o index=off' to disable inodes index.\n",
d_inode(origin)->i_ino, name.len, name.name,
@@ -515,18 +519,9 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry,
inode = d_inode(index);
if (d_is_negative(index)) {
- if (upper && d_inode(origin)->i_nlink > 1) {
- pr_warn_ratelimited("overlayfs: hard link with origin but no index (ino=%lu).\n",
- d_inode(origin)->i_ino);
- goto fail;
- }
-
- dput(index);
- index = NULL;
+ goto out_dput;
} else if (upper && d_inode(upper) != inode) {
- pr_warn_ratelimited("overlayfs: wrong index found (index=%pd2, ino=%lu, upper ino=%lu).\n",
- index, inode->i_ino, d_inode(upper)->i_ino);
- goto fail;
+ goto out_dput;
} else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) ||
((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) {
/*
@@ -546,6 +541,11 @@ out:
kfree(name.name);
return index;
+out_dput:
+ dput(index);
+ index = NULL;
+ goto out;
+
fail:
dput(index);
index = ERR_PTR(-EIO);
@@ -634,6 +634,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
}
if (d.redirect) {
+ err = -ENOMEM;
upperredirect = kstrdup(d.redirect, GFP_KERNEL);
if (!upperredirect)
goto out_put_upper;
@@ -708,7 +709,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperdentry = dget(index);
if (upperdentry || ctr) {
- inode = ovl_get_inode(dentry, upperdentry);
+ inode = ovl_get_inode(dentry, upperdentry, index);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_free_oe;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index d4e8c1a08fb0..d9a0edd4e57e 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -235,6 +235,7 @@ bool ovl_inuse_trylock(struct dentry *dentry);
void ovl_inuse_unlock(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry, bool *locked);
void ovl_nlink_end(struct dentry *dentry, bool locked);
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
static inline bool ovl_is_impuredir(struct dentry *dentry)
{
@@ -285,7 +286,8 @@ int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
bool ovl_is_private_xattr(const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
-struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry);
+struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry,
+ struct dentry *index);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 878a750986dd..36b49bd09264 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -37,6 +37,9 @@ struct ovl_fs {
bool noxattr;
/* sb common to all layers */
struct super_block *same_sb;
+ /* Did we take the inuse lock? */
+ bool upperdir_locked;
+ bool workdir_locked;
};
/* private information held for every overlayfs dentry */
@@ -74,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode)
static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
{
- return lockless_dereference(oi->__upperdentry);
+ return READ_ONCE(oi->__upperdentry);
}
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 62e9b22a2077..c310e3ff7f3f 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
- realfile = lockless_dereference(od->upperfile);
+ realfile = READ_ONCE(od->upperfile);
if (!realfile) {
struct path upperpath;
@@ -988,6 +988,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
struct path *lowerstack, unsigned int numlower)
{
int err;
+ struct dentry *index = NULL;
struct inode *dir = dentry->d_inode;
struct path path = { .mnt = mnt, .dentry = dentry };
LIST_HEAD(list);
@@ -1007,8 +1008,6 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
inode_lock_nested(dir, I_MUTEX_PARENT);
list_for_each_entry(p, &list, l_node) {
- struct dentry *index;
-
if (p->name[0] == '.') {
if (p->len == 1)
continue;
@@ -1018,18 +1017,20 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
index = lookup_one_len(p->name, dentry, p->len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
+ index = NULL;
break;
}
err = ovl_verify_index(index, lowerstack, numlower);
- if (err) {
- if (err == -EROFS)
- break;
+ /* Cleanup stale and orphan index entries */
+ if (err && (err == -ESTALE || err == -ENOENT))
err = ovl_cleanup(dir, index);
- if (err)
- break;
- }
+ if (err)
+ break;
+
dput(index);
+ index = NULL;
}
+ dput(index);
inode_unlock(dir);
out:
ovl_cache_free(&list);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index fd5ea4facc62..f5738e96a052 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -174,6 +174,9 @@ static struct inode *ovl_alloc_inode(struct super_block *sb)
{
struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL);
+ if (!oi)
+ return NULL;
+
oi->cache = NULL;
oi->redirect = NULL;
oi->version = 0;
@@ -211,9 +214,10 @@ static void ovl_put_super(struct super_block *sb)
dput(ufs->indexdir);
dput(ufs->workdir);
- ovl_inuse_unlock(ufs->workbasedir);
+ if (ufs->workdir_locked)
+ ovl_inuse_unlock(ufs->workbasedir);
dput(ufs->workbasedir);
- if (ufs->upper_mnt)
+ if (ufs->upper_mnt && ufs->upperdir_locked)
ovl_inuse_unlock(ufs->upper_mnt->mnt_root);
mntput(ufs->upper_mnt);
for (i = 0; i < ufs->numlower; i++)
@@ -881,9 +885,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_put_upperpath;
err = -EBUSY;
- if (!ovl_inuse_trylock(upperpath.dentry)) {
- pr_err("overlayfs: upperdir is in-use by another mount\n");
+ if (ovl_inuse_trylock(upperpath.dentry)) {
+ ufs->upperdir_locked = true;
+ } else if (ufs->config.index) {
+ pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
goto out_put_upperpath;
+ } else {
+ pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
}
err = ovl_mount_dir(ufs->config.workdir, &workpath);
@@ -901,9 +909,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
}
err = -EBUSY;
- if (!ovl_inuse_trylock(workpath.dentry)) {
- pr_err("overlayfs: workdir is in-use by another mount\n");
+ if (ovl_inuse_trylock(workpath.dentry)) {
+ ufs->workdir_locked = true;
+ } else if (ufs->config.index) {
+ pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
goto out_put_workpath;
+ } else {
+ pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
}
ufs->workbasedir = workpath.dentry;
@@ -1156,11 +1168,13 @@ out_put_lowerpath:
out_free_lowertmp:
kfree(lowertmp);
out_unlock_workdentry:
- ovl_inuse_unlock(workpath.dentry);
+ if (ufs->workdir_locked)
+ ovl_inuse_unlock(workpath.dentry);
out_put_workpath:
path_put(&workpath);
out_unlock_upperdentry:
- ovl_inuse_unlock(upperpath.dentry);
+ if (ufs->upperdir_locked)
+ ovl_inuse_unlock(upperpath.dentry);
out_put_upperpath:
path_put(&upperpath);
out_free_config:
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 117794582f9f..b9b239fa5cfd 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -430,7 +430,7 @@ void ovl_inuse_unlock(struct dentry *dentry)
}
}
-/* Called must hold OVL_I(inode)->oi_lock */
+/* Caller must hold OVL_I(inode)->lock */
static void ovl_cleanup_index(struct dentry *dentry)
{
struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode;
@@ -469,6 +469,9 @@ static void ovl_cleanup_index(struct dentry *dentry)
err = PTR_ERR(index);
if (!IS_ERR(index))
err = ovl_cleanup(dir, index);
+ else
+ index = NULL;
+
inode_unlock(dir);
if (err)
goto fail;
@@ -557,3 +560,22 @@ void ovl_nlink_end(struct dentry *dentry, bool locked)
mutex_unlock(&OVL_I(d_inode(dentry))->lock);
}
}
+
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
+{
+ /* Workdir should not be the same as upperdir */
+ if (workdir == upperdir)
+ goto err;
+
+ /* Workdir should not be subdir of upperdir and vice versa */
+ if (lock_rename(workdir, upperdir) != NULL)
+ goto err_unlock;
+
+ return 0;
+
+err_unlock:
+ unlock_rename(workdir, upperdir);
+err:
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ return -EIO;
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index 97e5be897753..349c9d56d4b3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/pipe.c
*
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 12c6922c913c..f7456c4e7d0f 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux proc filesystem routines.
#
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 525157ca25cb..6f6fc1672ad1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/array.c
*
@@ -119,30 +120,25 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
* simple bit tests.
*/
static const char * const task_state_array[] = {
- "R (running)", /* 0 */
- "S (sleeping)", /* 1 */
- "D (disk sleep)", /* 2 */
- "T (stopped)", /* 4 */
- "t (tracing stop)", /* 8 */
- "X (dead)", /* 16 */
- "Z (zombie)", /* 32 */
+
+ /* states in TASK_REPORT: */
+ "R (running)", /* 0x00 */
+ "S (sleeping)", /* 0x01 */
+ "D (disk sleep)", /* 0x02 */
+ "T (stopped)", /* 0x04 */
+ "t (tracing stop)", /* 0x08 */
+ "X (dead)", /* 0x10 */
+ "Z (zombie)", /* 0x20 */
+ "P (parked)", /* 0x40 */
+
+ /* states beyond TASK_REPORT: */
+ "I (idle)", /* 0x80 */
};
static inline const char *get_task_state(struct task_struct *tsk)
{
- unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
-
- /*
- * Parked tasks do not run; they sit in __kthread_parkme().
- * Without this check, we would report them as running, which is
- * clearly wrong, so we report them as sleeping instead.
- */
- if (tsk->state == TASK_PARKED)
- state = TASK_INTERRUPTIBLE;
-
- BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
-
- return task_state_array[fls(state)];
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
+ return task_state_array[task_state_index(tsk)];
}
static inline int get_task_umask(struct task_struct *tsk)
@@ -458,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
cutime = sig->cutime;
cstime = sig->cstime;
cgtime = sig->cgtime;
- rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
+ rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
/* add up live thread stats at the group level */
if (whole) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ad3b0762cc3e..9d357b2ea6cb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/base.c
*
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index cbd82dff7e81..403cbb12a6e9 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 06f4d31e0396..e0f867cd8553 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index e5709343feb7..2c7f22b14489 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index c330495c3115..96fc70225e54 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/dcache.h>
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index 46dafadd0083..f371a602bf58 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PROCFS_FD_H__
#define __PROCFS_FD_H__
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e250910cffc8..225f541f7078 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/inode.c
*
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index a352d5703b41..6a6bee9c603c 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 45629f4b5402..4bc85cb8be6a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/proc/kcore.c kernel ELF core dumper
*
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index f9387bb7631b..e0f8774acd65 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/kmsg.c
*
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 983fce5c2418..9bc5c58c00ee 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/pid_namespace.h>
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cdd979724c74..6bb20f864259 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 3803b24ca220..59b17e509f46 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2726536489b1..1491918a33c3 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/fs.h>
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8f479229b349..c5cbbdff3c3d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* /proc/sys support
*/
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 901bd06f437d..d0cf1c50bb6c 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* proc_tty.c -- handles /proc/tty
*
@@ -14,6 +15,7 @@
#include <linux/tty.h>
#include <linux/seq_file.h>
#include <linux/bitops.h>
+#include "internal.h"
/*
* The /proc/tty directory inodes...
@@ -164,7 +166,7 @@ void proc_tty_unregister_driver(struct tty_driver *driver)
if (!ent)
return;
- remove_proc_entry(driver->driver_name, proc_tty_driver);
+ remove_proc_entry(ent->name, proc_tty_driver);
driver->proc_entry = NULL;
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 926fb27f4ca2..4e42aba97f2e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/root.c
*
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 39857f6db5cf..31326bb23b8b 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index ad8a77f94beb..24072cc06e65 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/proc_fs.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bd4e55f4aa20..59749dfaef67 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cpumask.h>
#include <linux/fs.h>
#include <linux/init.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5589b4bd4b85..6744bd706ecf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/hugetlb.h>
@@ -1310,13 +1311,15 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
struct page *page = NULL;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
if (pmd_present(pmd)) {
page = pmd_page(pmd);
flags |= PM_PRESENT;
+ if (pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1328,6 +1331,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
page = migration_entry_to_page(entry);
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index b00b766098fa..5b62f57bd9bc 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/file.h>
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 20614b62a9b7..b813e3b529f2 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 7981c4ffe787..95a708d83721 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/fs/proc/version.c b/fs/proc/version.c
index d2154eb6d78f..94901e8e700d 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 99dff222fe67..7b635d173213 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats}
*
@@ -27,7 +28,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
poll_wait(file, &p->ns->poll, wait);
- event = ACCESS_ONCE(ns->event);
+ event = READ_ONCE(ns->event);
if (m->poll_event != event) {
m->poll_event = event;
res |= POLLERR | POLLPRI;
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index b8803cc07fce..967b5891f325 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux pstorefs routines.
#
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 7f4e48c8d188..c029314478fa 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PSTORE_INTERNAL_H__
#define __PSTORE_INTERNAL_H__
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2b21d180157c..086e491faf04 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -62,7 +62,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
static int pstore_new_entry;
static void pstore_timefunc(unsigned long);
-static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0);
+static DEFINE_TIMER(pstore_timer, pstore_timefunc);
static void pstore_dowork(struct work_struct *);
static DECLARE_WORK(pstore_work, pstore_dowork);
@@ -482,10 +482,7 @@ void pstore_record_init(struct pstore_record *record,
record->psi = psinfo;
/* Report zeroed timestamp if called before timekeeping has resumed. */
- if (__getnstimeofday(&record->time)) {
- record->time.tv_sec = 0;
- record->time.tv_nsec = 0;
- }
+ record->time = ns_to_timespec(ktime_get_real_fast_ns());
}
/*
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 76a7a697b778..163afc4ba4b2 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* QNX4 file system, Linux implementation.
*
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 781056a0480f..a6ee23aadd28 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* QNX4 file system, Linux implementation.
*
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index e62c8183777a..eca27878079d 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* QNX4 file system, Linux implementation.
*
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index c9b1be2c164d..6283705466a4 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>
#include <linux/qnx4_fs.h>
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 27637e0bdc9f..c1cfb8a19e9d 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* QNX6 file system, Linux implementation.
*
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 6c1a323137dd..72c2770830be 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* QNX6 file system, Linux implementation.
*
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index f23b5c4a66ad..34a6b126a3a9 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* QNX6 file system, Linux implementation.
*
diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c
index 62aaf3e3126a..d282c2c73404 100644
--- a/fs/qnx6/super_mmi.c
+++ b/fs/qnx6/super_mmi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* QNX6 file system, Linux implementation.
*
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index c66c37cdaa39..f2b49d0f0287 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_QUOTA) += dquot.o
obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
index fb1892fe3e56..779caed4f078 100644
--- a/fs/quota/compat.c
+++ b/fs/quota/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/syscalls.h>
#include <linux/compat.h>
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8381db9db6d9..9f78b5015f2e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of the diskquota system for the LINUX operating system. QUOTA
* is implemented using the BSD system call interface as the means of
@@ -1297,21 +1298,18 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space,
spin_lock(&dquot->dq_dqb_lock);
if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- goto add;
+ goto finish;
tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
+ space + rsv_space;
- if (flags & DQUOT_SPACE_NOFAIL)
- goto add;
-
if (dquot->dq_dqb.dqb_bhardlimit &&
tspace > dquot->dq_dqb.dqb_bhardlimit &&
!ignore_hardlimit(dquot)) {
if (flags & DQUOT_SPACE_WARN)
prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
ret = -EDQUOT;
- goto out;
+ goto finish;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1322,7 +1320,7 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space,
if (flags & DQUOT_SPACE_WARN)
prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
ret = -EDQUOT;
- goto out;
+ goto finish;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1338,13 +1336,21 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space,
* be always printed
*/
ret = -EDQUOT;
- goto out;
+ goto finish;
}
}
-add:
- dquot->dq_dqb.dqb_rsvspace += rsv_space;
- dquot->dq_dqb.dqb_curspace += space;
-out:
+finish:
+ /*
+ * We have to be careful and go through warning generation & grace time
+ * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it
+ * only here...
+ */
+ if (flags & DQUOT_SPACE_NOFAIL)
+ ret = 0;
+ if (!ret) {
+ dquot->dq_dqb.dqb_rsvspace += rsv_space;
+ dquot->dq_dqb.dqb_curspace += space;
+ }
spin_unlock(&dquot->dq_dqb_lock);
return ret;
}
@@ -1980,7 +1986,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0,
&warn_to[cnt]);
if (ret) {
+ spin_lock(&transfer_to[cnt]->dq_dqb_lock);
dquot_decr_inodes(transfer_to[cnt], inode_usage);
+ spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
goto over_quota;
}
}
diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c
index ebc5e6285800..f814fa90af38 100644
--- a/fs/quota/kqid.c
+++ b/fs/quota/kqid.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/quota.h>
#include <linux/export.h>
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index e99b1a72d9a7..95acdae391b4 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cred.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index a9c5dfe6b83e..43612e2a73af 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Quota code necessary even when VFS quota support is not compiled
* into the kernel. The interesting stuff is over in dquot.c, here
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..31cf27e0e9e0 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Definitions of structures for vfsv0 quota format
*/
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index c0187cda2c1e..a73e5b34db41 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -328,12 +328,16 @@ static int v2_write_dquot(struct dquot *dquot)
if (!dquot->dq_off) {
alloc = true;
down_write(&dqopt->dqio_sem);
+ } else {
+ down_read(&dqopt->dqio_sem);
}
ret = qtree_write_dquot(
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
dquot);
if (alloc)
up_write(&dqopt->dqio_sem);
+ else
+ up_read(&dqopt->dqio_sem);
return ret;
}
diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h
index 746654b5de70..bd11e2c08119 100644
--- a/fs/quota/quotaio_v1.h
+++ b/fs/quota/quotaio_v1.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_QUOTAIO_V1_H
#define _LINUX_QUOTAIO_V1_H
diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 4e95430093d9..43cf0f0e2902 100644
--- a/fs/quota/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Definitions of structures for vfsv0 quota format
*/
diff --git a/fs/read_write.c b/fs/read_write.c
index a2b9a47235c5..0046d72efe94 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/read_write.c
*
@@ -112,7 +113,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
*/
- if (offset >= eof)
+ if ((unsigned long long)offset >= eof)
return -ENXIO;
break;
case SEEK_HOLE:
@@ -120,7 +121,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
- if (offset >= eof)
+ if ((unsigned long long)offset >= eof)
return -ENXIO;
offset = eof;
break;
diff --git a/fs/readdir.c b/fs/readdir.c
index 89659549c09d..1b83b0ad183b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/readdir.c
*
@@ -36,13 +37,12 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
if (res)
goto out;
- if (shared) {
- inode_lock_shared(inode);
- } else {
+ if (shared)
+ res = down_read_killable(&inode->i_rwsem);
+ else
res = down_write_killable(&inode->i_rwsem);
- if (res)
- goto out;
- }
+ if (res)
+ goto out;
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 3c3b00165114..a39a562c1c10 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux reiser-filesystem routines.
#
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index 4a211f5b34b8..0c1c847f992f 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/init.h>
#include <linux/posix_acl.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index f59c667df15b..69ff280bdfe8 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Write ahead logging implementation copyright Chris Mason 2000
*
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index 045b83ef9fd9..46bd7bd63a71 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "reiserfs.h"
#include <linux/mutex.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 1d34377fef97..48835a659948 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
* licensing and copyright details
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index 2d5489b0a269..b0ae088dffc7 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
* details
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e87aa21c30de..46492fb37a4c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/reiserfs/xattr.c
*
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index 613ff5aef94e..c764352447ba 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/reiserfs_xattr.h>
#include <linux/init.h>
#include <linux/list.h>
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 54415f0e3d18..aa9380bac196 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/posix_acl.h>
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index e4cbb7719906..20be9a0e5870 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "reiserfs.h"
#include <linux/errno.h>
#include <linux/fs.h>
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index f15a5f9e84ce..5ed48da3d02b 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "reiserfs.h"
#include <linux/capability.h>
#include <linux/errno.h>
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index dc59df43b2db..a573ca45bacc 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "reiserfs.h"
#include <linux/errno.h>
#include <linux/fs.h>
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index 420beb7d495c..844928f15711 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux RomFS filesystem routines.
#
diff --git a/fs/select.c b/fs/select.c
index c6362e38ae92..063067e606ca 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file contains the procedures for the handling of select and poll
*
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dc7c2be963ed..4be761c1a03d 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/seq_file.c
*
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d2c434112f42..1c667af86da5 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/signalfd.c
*
diff --git a/fs/splice.c b/fs/splice.c
index f3084cce0ea6..39e2dc01ac12 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -253,7 +253,7 @@ EXPORT_SYMBOL(add_to_pipe);
*/
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
- unsigned int buffers = ACCESS_ONCE(pipe->buffers);
+ unsigned int buffers = READ_ONCE(pipe->buffers);
spd->nr_pages_max = buffers;
if (buffers <= PIPE_DEF_BUFFERS)
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 6655631c53ae..7bd9b8b856d0 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux squashfs routines.
#
diff --git a/fs/stat.c b/fs/stat.c
index 8a6aa8caf891..873785dae022 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/stat.c
*
diff --git a/fs/statfs.c b/fs/statfs.c
index fab9b6a3c116..c25dd9a26cc1 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
diff --git a/fs/super.c b/fs/super.c
index 166c4ee0d0ed..994db21f59bf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/super.c
*
diff --git a/fs/sync.c b/fs/sync.c
index a576aa2e6b09..83ac79a960dd 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* High-level sync()-related operations
*/
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
index 862c1f74a583..0e69dbdf7277 100644
--- a/fs/sysv/balloc.c
+++ b/fs/sysv/balloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/balloc.c
*
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index f5191cb2c947..88e38cd8f5c9 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/dir.c
*
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 7ba997e31aeb..45fc79a18594 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/file.c
*
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index eb963fbb7903..6c9801986af6 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/ialloc.c
*
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 1c8bf9453a71..3c47b7d5d4cf 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/inode.c
*
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 83809f5b5eca..bcb67b0cabe7 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/itree.c
*
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index d8817f139763..250b0755b908 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/sysv/namei.c
*
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 1e7e27c729af..e913698779c0 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SYSV_H
#define _SYSV_H
diff --git a/fs/timerfd.c b/fs/timerfd.c
index ece0c02d7e63..040612ec9598 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/timerfd.c
*
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 6f3251c2bf08..9758f709c736 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_UBIFS_FS) += ubifs.o
ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 114ba455bac3..616a688f5d8f 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "ubifs.h"
static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len)
@@ -87,7 +88,6 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
- .is_encrypted = __ubifs_crypt_is_encrypted,
.empty_dir = ubifs_crypt_empty_dir,
.max_namelen = ubifs_crypt_max_namelen,
};
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index fdc311246807..0164bcc827f8 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -38,7 +38,8 @@ void ubifs_set_inode_flags(struct inode *inode)
{
unsigned int flags = ubifs_inode(inode)->flags;
- inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
+ inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC |
+ S_ENCRYPTED);
if (flags & UBIFS_SYNC_FL)
inode->i_flags |= S_SYNC;
if (flags & UBIFS_APPEND_FL)
@@ -47,6 +48,8 @@ void ubifs_set_inode_flags(struct inode *inode)
inode->i_flags |= S_IMMUTABLE;
if (flags & UBIFS_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
+ if (flags & UBIFS_CRYPT_FL)
+ inode->i_flags |= S_ENCRYPTED;
}
/*
diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c
index 486a2844949f..586fd5b578a7 100644
--- a/fs/ubifs/misc.c
+++ b/fs/ubifs/misc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include "ubifs.h"
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5496b17b959c..7503e7cdf870 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2007,12 +2007,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
return c;
}
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
-const struct fscrypt_operations ubifs_crypt_operations = {
- .is_encrypted = __ubifs_crypt_is_encrypted,
-};
-#endif
-
static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ubifs_info *c = sb->s_fs_info;
@@ -2055,7 +2049,9 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
sb->s_xattr = ubifs_xattr_handlers;
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
sb->s_cop = &ubifs_crypt_operations;
+#endif
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index cd43651f1731..63c7468147eb 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -38,12 +38,11 @@
#include <linux/backing-dev.h>
#include <linux/security.h>
#include <linux/xattr.h>
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
#include <linux/random.h>
+
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
#include "ubifs-media.h"
/* Version of this UBIFS implementation */
@@ -1835,18 +1834,13 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
extern const struct fscrypt_operations ubifs_crypt_operations;
-static inline bool __ubifs_crypt_is_encrypted(struct inode *inode)
+static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
{
- struct ubifs_inode *ui = ubifs_inode(inode);
+ const struct ubifs_inode *ui = ubifs_inode(inode);
return ui->flags & UBIFS_CRYPT_FL;
}
-static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
-{
- return __ubifs_crypt_is_encrypted((struct inode *)inode);
-}
-
/* Normal UBIFS messages */
__printf(2, 3)
void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c13eae819cbc..5ddc89d564fd 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -170,6 +170,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
if (err)
goto out_cancel;
+ ubifs_set_inode_flags(host);
mutex_unlock(&host_ui->ui_mutex);
ubifs_release_budget(c, &req);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index b1b9a63d8cf3..630426ffb775 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDF_I_H
#define _UDF_I_H
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index c13875d669c0..68c9f1d618f5 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UDF_SB_H
#define __LINUX_UDF_SB_H
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 63b034984378..fa206558128d 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __UDF_DECL_H
#define __UDF_DECL_H
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 6a9f3a9cc428..a4363ac2cfeb 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __UDF_ENDIAN_H
#define __UDF_ENDIAN_H
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index f80be4c5df9d..b5cd79065ef9 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/balloc.c
*
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index b4676322ddb6..1abe5454de47 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/cylinder.c
*
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 48609f1d9580..2edc1755b7c5 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/ufs_dir.c
*
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 042ddbf110cc..7e087581be7e 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/file.c
*
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d1dd8cc33179..916b4a428933 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/ialloc.c
*
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index f36d6a53687d..afb601c0dda0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/inode.c
*
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 8eca4eda8450..32545cd00ceb 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/namei.c
*
diff --git a/fs/ufs/swab.h b/fs/ufs/swab.h
index 8d974c4fd18b..a0e1d8c827f4 100644
--- a/fs/ufs/swab.h
+++ b/fs/ufs/swab.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/ufs/swab.h
*
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c87f4c3fa9dd..b49e0efdf3d7 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UFS_UFS_H
#define _UFS_UFS_H 1
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 150eef6f1233..ef9ead44776a 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/include/linux/ufs_fs.h
*
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 02497a492eb2..4fa633f84274 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ufs/util.c
*
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9fc7119a1551..1907be6d5808 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/fs/ufs/util.h
*
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ef4b48d1ea42..f46d133c0949 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -381,7 +381,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* in __get_user_pages if userfaultfd_release waits on the
* caller of handle_userfault to release the mmap_sem.
*/
- if (unlikely(ACCESS_ONCE(ctx->released))) {
+ if (unlikely(READ_ONCE(ctx->released))) {
/*
* Don't return VM_FAULT_SIGBUS in this case, so a non
* cooperative manager can close the uffd after the
@@ -477,7 +477,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
vmf->flags, reason);
up_read(&mm->mmap_sem);
- if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ if (likely(must_wait && !READ_ONCE(ctx->released) &&
(return_to_userland ? !signal_pending(current) :
!fatal_signal_pending(current)))) {
wake_up_poll(&ctx->fd_wqh, POLLIN);
@@ -586,8 +586,14 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
set_current_state(TASK_KILLABLE);
if (ewq->msg.event == 0)
break;
- if (ACCESS_ONCE(ctx->released) ||
+ if (READ_ONCE(ctx->released) ||
fatal_signal_pending(current)) {
+ /*
+ * &ewq->wq may be queued in fork_event, but
+ * __remove_wait_queue ignores the head
+ * parameter. It would be a problem if it
+ * didn't.
+ */
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
if (ewq->msg.event == UFFD_EVENT_FORK) {
struct userfaultfd_ctx *new;
@@ -827,7 +833,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
- ACCESS_ONCE(ctx->released) = true;
+ WRITE_ONCE(ctx->released, true);
if (!mmget_not_zero(mm))
goto wakeup;
@@ -1061,6 +1067,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
(unsigned long)
uwq->msg.arg.reserved.reserved1;
list_move(&uwq->wq.entry, &fork_event);
+ /*
+ * fork_nctx can be freed as soon as
+ * we drop the lock, unless we take a
+ * reference on it.
+ */
+ userfaultfd_ctx_get(fork_nctx);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
@@ -1091,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!ret && msg->event == UFFD_EVENT_FORK) {
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+ spin_lock(&ctx->event_wqh.lock);
+ if (!list_empty(&fork_event)) {
+ /*
+ * The fork thread didn't abort, so we can
+ * drop the temporary refcount.
+ */
+ userfaultfd_ctx_put(fork_nctx);
+
+ uwq = list_first_entry(&fork_event,
+ typeof(*uwq),
+ wq.entry);
+ /*
+ * If fork_event list wasn't empty and in turn
+ * the event wasn't already released by fork
+ * (the event is allocated on fork kernel
+ * stack), put the event back to its place in
+ * the event_wq. fork_event head will be freed
+ * as soon as we return so the event cannot
+ * stay queued there no matter the current
+ * "ret" value.
+ */
+ list_del(&uwq->wq.entry);
+ __add_wait_queue(&ctx->event_wqh, &uwq->wq);
- if (!ret) {
- spin_lock(&ctx->event_wqh.lock);
- if (!list_empty(&fork_event)) {
- uwq = list_first_entry(&fork_event,
- typeof(*uwq),
- wq.entry);
- list_del(&uwq->wq.entry);
- __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+ /*
+ * Leave the event in the waitqueue and report
+ * error to userland if we failed to resolve
+ * the userfault fork.
+ */
+ if (likely(!ret))
userfaultfd_event_complete(ctx, uwq);
- }
- spin_unlock(&ctx->event_wqh.lock);
+ } else {
+ /*
+ * Here the fork thread aborted and the
+ * refcount from the fork thread on fork_nctx
+ * has already been released. We still hold
+ * the reference we took before releasing the
+ * lock above. If resolve_userfault_fork
+ * failed we've to drop it because the
+ * fork_nctx has to be freed in such case. If
+ * it succeeded we'll hold it because the new
+ * uffd references it.
+ */
+ if (ret)
+ userfaultfd_ctx_put(fork_nctx);
}
+ spin_unlock(&ctx->event_wqh.lock);
}
return ret;
diff --git a/fs/utimes.c b/fs/utimes.c
index 51edb9f9507c..e4b3d7c2c9f5 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
diff --git a/fs/xattr.c b/fs/xattr.c
index 4424f7fecf14..61cd28ba25f3 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -250,7 +250,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value,
}
memcpy(value, buffer, len);
out:
- security_release_secctx(buffer, len);
+ kfree(buffer);
out_noalloc:
return len;
}
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 1b98cfa342ab..f42fcf1b5465 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -71,6 +71,23 @@ config XFS_RT
If unsure, say N.
+config XFS_ONLINE_SCRUB
+ bool "XFS online metadata check support"
+ default n
+ depends on XFS_FS
+ help
+ If you say Y here you will be able to check metadata on a
+ mounted XFS filesystem. This feature is intended to reduce
+ filesystem downtime by supplementing xfs_repair. The key
+ advantage here is to look for problems proactively so that
+ they can be dealt with in a controlled manner.
+
+ This feature is considered EXPERIMENTAL. Use with caution!
+
+ See the xfs_scrub man page in section 8 for additional information.
+
+ If unsure, say N.
+
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a6e955bfead8..7ceb41a9786a 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_dquot_buf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
+ xfs_iext_tree.o \
xfs_inode_fork.o \
xfs_inode_buf.o \
xfs_log_rlimit.o \
@@ -135,3 +136,31 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+
+# online scrub/repair
+ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
+
+# Tracepoints like to blow up, so build that before everything else
+
+xfs-y += $(addprefix scrub/, \
+ trace.o \
+ agheader.o \
+ alloc.o \
+ attr.o \
+ bmap.o \
+ btree.o \
+ common.o \
+ dabtree.o \
+ dir.o \
+ ialloc.o \
+ inode.o \
+ parent.o \
+ refcount.o \
+ rmap.o \
+ scrub.o \
+ symlink.o \
+ )
+
+xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
+xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+endif
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 4d85992d75b2..758f37ac5ad3 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -119,8 +119,7 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr)
static inline void
kmem_zone_destroy(kmem_zone_t *zone)
{
- if (zone)
- kmem_cache_destroy(zone);
+ kmem_cache_destroy(zone);
}
extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index b008ff3250eb..2291f4224e24 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -27,6 +27,7 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_alloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
@@ -156,7 +157,8 @@ __xfs_ag_resv_free(
trace_xfs_ag_resv_free(pag, type, 0);
resv = xfs_perag_resv(pag, type);
- pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ if (pag->pag_agno == 0)
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
/*
* AGFL blocks are always considered "free", so whatever
* was reserved at mount time must be given back at umount.
@@ -216,7 +218,14 @@ __xfs_ag_resv_init(
return error;
}
- mp->m_ag_max_usable -= ask;
+ /*
+ * Reduce the maximum per-AG allocation length by however much we're
+ * trying to reserve for an AG. Since this is a filesystem-wide
+ * counter, we only make the adjustment for AG 0. This assumes that
+ * there aren't any AGs hungrier for per-AG reservation than AG 0.
+ */
+ if (pag->pag_agno == 0)
+ mp->m_ag_max_usable -= ask;
resv = xfs_perag_resv(pag, type);
resv->ar_asked = ask;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 744dcaec34cc..0da80019a917 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -31,6 +31,7 @@
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
@@ -1584,6 +1585,10 @@ xfs_alloc_ag_vextent_small(
bp = xfs_btree_get_bufs(args->mp, args->tp,
args->agno, fbno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
xfs_trans_binval(args->tp, bp);
}
args->len = 1;
@@ -2141,6 +2146,10 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto out_agbp_relse;
+ }
xfs_trans_binval(tp, bp);
}
@@ -2923,3 +2932,52 @@ xfs_alloc_query_all(
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
+
+/* Find the size of the AG, in blocks. */
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ ASSERT(agno < mp->m_sb.sb_agcount);
+
+ if (agno < mp->m_sb.sb_agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Verify that an AG block number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+bool
+xfs_verify_agbno(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno)
+{
+ xfs_agblock_t eoag;
+
+ eoag = xfs_ag_block_count(mp, agno);
+ if (agbno >= eoag)
+ return false;
+ if (agbno <= XFS_AGFL_BLOCK(mp))
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an FS block number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+bool
+xfs_verify_fsbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno)
+{
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ef26edc2e938..7ba2d129d504 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -232,5 +232,9 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur,
xfs_alloc_query_range_fn fn, void *priv);
int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
void *priv);
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno);
+bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5c16db86b38f..53cc8b986eac 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -397,13 +397,9 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
/* rounded down */
offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
- switch (dp->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
+ if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) {
minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
return (offset >= minforkoff) ? minforkoff : 0;
- case XFS_DINODE_FMT_UUID:
- minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
- return (offset >= minforkoff) ? minforkoff : 0;
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 459f4b4f08fe..08df809e2315 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -38,6 +38,7 @@
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
@@ -49,7 +50,6 @@
#include "xfs_rmap.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
-#include "xfs_rmap_btree.h"
#include "xfs_icache.h"
@@ -113,28 +113,21 @@ xfs_bmap_compute_maxlevels(
STATIC int /* error */
xfs_bmbt_lookup_eq(
struct xfs_btree_cur *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
+ struct xfs_bmbt_irec *irec,
int *stat) /* success/failure */
{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
+ cur->bc_rec.b = *irec;
return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
}
STATIC int /* error */
-xfs_bmbt_lookup_ge(
+xfs_bmbt_lookup_first(
struct xfs_btree_cur *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
int *stat) /* success/failure */
{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
+ cur->bc_rec.b.br_startoff = 0;
+ cur->bc_rec.b.br_startblock = 0;
+ cur->bc_rec.b.br_blockcount = 0;
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
@@ -161,21 +154,17 @@ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
}
/*
- * Update the record referred to by cur to the value given
- * by [off, bno, len, state].
+ * Update the record referred to by cur to the value given by irec
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
STATIC int
xfs_bmbt_update(
struct xfs_btree_cur *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- xfs_exntst_t state)
+ struct xfs_bmbt_irec *irec)
{
union xfs_btree_rec rec;
- xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+ xfs_bmbt_disk_set_all(&rec.bmbt, irec);
return xfs_btree_update(cur, &rec);
}
@@ -192,12 +181,8 @@ xfs_bmap_worst_indlen(
int maxrecs; /* maximum record count at this level */
xfs_mount_t *mp; /* mount structure */
xfs_filblks_t rval; /* return value */
- xfs_filblks_t orig_len;
mp = ip->i_mount;
-
- /* Calculate the worst-case size of the bmbt. */
- orig_len = len;
maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
@@ -205,20 +190,12 @@ xfs_bmap_worst_indlen(
len += maxrecs - 1;
do_div(len, maxrecs);
rval += len;
- if (len == 1) {
- rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
level - 1;
- break;
- }
if (level == 0)
maxrecs = mp->m_bmap_dmxr[1];
}
-
- /* Calculate the worst-case size of the rmapbt. */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) +
- mp->m_rmap_maxlevels;
-
return rval;
}
@@ -255,7 +232,6 @@ xfs_bmap_forkoff_reset(
{
if (whichfork == XFS_ATTR_FORK &&
ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
- ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
@@ -512,31 +488,6 @@ error_norelse:
}
/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork, /* data or attr or cow fork */
- unsigned long caller_ip)
-{
- xfs_extnum_t idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int state = 0;
-
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
- else if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == xfs_iext_count(ifp));
- for (idx = 0; idx < cnt; idx++)
- trace_xfs_extlist(ip, idx, state, caller_ip);
-}
-
-/*
* Validate that the bmbt_irecs being returned from bmapi are valid
* given the caller's original parameters. Specifically check the
* ranges of the returned irecs to ensure that they only extend beyond
@@ -670,8 +621,8 @@ xfs_bmap_btree_to_extents(
cbno = be64_to_cpu(*pp);
*logflagsp = 0;
#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
- return error;
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+ xfs_btree_check_lptr(cur, cbno, 1));
#endif
error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
@@ -716,14 +667,14 @@ xfs_bmap_extents_to_btree(
xfs_bmbt_rec_t *arp; /* child record pointer */
struct xfs_btree_block *block; /* btree root block */
xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
int error; /* error return value */
- xfs_extnum_t i, cnt; /* extent record index */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_key_t *kp; /* root block key pointer */
xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* number of file extents */
xfs_bmbt_ptr_t *pp; /* root block address pointer */
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ xfs_extnum_t cnt = 0;
mp = ip->i_mount;
ASSERT(whichfork != XFS_COW_FORK);
@@ -802,15 +753,12 @@ xfs_bmap_extents_to_btree(
XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
XFS_BTREE_LONG_PTRS);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- nextents = xfs_iext_count(ifp);
- for (cnt = i = 0; i < nextents; i++) {
- ep = xfs_iext_get_ext(ifp, i);
- if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
- arp->l0 = cpu_to_be64(ep->l0);
- arp->l1 = cpu_to_be64(ep->l1);
- arp++; cnt++;
- }
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+ xfs_bmbt_disk_set_all(arp, &rec);
+ cnt++;
}
ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
xfs_btree_set_numrecs(ablock, cnt);
@@ -858,6 +806,8 @@ xfs_bmap_local_to_extents_empty(
xfs_bmap_forkoff_reset(ip, whichfork);
ifp->if_flags &= ~XFS_IFINLINE;
ifp->if_flags |= XFS_IFEXTENTS;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
}
@@ -881,6 +831,7 @@ xfs_bmap_local_to_extents(
xfs_alloc_arg_t args; /* allocation arguments */
xfs_buf_t *bp; /* buffer for extent block */
struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
/*
* We don't want to deal with the case of keeping inode data inline yet.
@@ -898,8 +849,7 @@ xfs_bmap_local_to_extents(
flags = 0;
error = 0;
- ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
- XFS_IFINLINE);
+ ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE);
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
@@ -943,15 +893,16 @@ xfs_bmap_local_to_extents(
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+
rec.br_startoff = 0;
rec.br_startblock = args.fsbno;
rec.br_blockcount = 1;
rec.br_state = XFS_EXT_NORM;
- xfs_iext_insert(ip, 0, 1, &rec, 0);
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &rec, 0);
- trace_xfs_bmap_post_update(ip, 0,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
- _THIS_IP_);
XFS_IFORK_NEXT_SET(ip, whichfork, 1);
ip->i_d.di_nblocks = 1;
xfs_trans_mod_dquot_byino(tp, ip,
@@ -986,7 +937,8 @@ xfs_bmap_add_attrfork_btree(
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
cur->bc_private.b.dfops = dfops;
cur->bc_private.b.firstblock = *firstblock;
- if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+ error = xfs_bmbt_lookup_first(cur, &stat);
+ if (error)
goto error0;
/* must be at least one entry */
XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
@@ -1137,9 +1089,6 @@ xfs_bmap_add_attrfork(
case XFS_DINODE_FMT_DEV:
ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
break;
- case XFS_DINODE_FMT_UUID:
- ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
- break;
case XFS_DINODE_FMT_LOCAL:
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -1219,32 +1168,35 @@ trans_cancel:
*/
/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
+ * Read in extents from a btree-format inode.
*/
-int /* error */
-xfs_bmap_read_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
+int
+xfs_iread_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_extnum_t i, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- /* REFERENCED */
- xfs_extnum_t room; /* number of entries there's room for */
+ struct xfs_mount *mp = ip->i_mount;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+ struct xfs_btree_block *block = ifp->if_broot;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec new;
+ xfs_fsblock_t bno;
+ struct xfs_buf *bp;
+ xfs_extnum_t i, j;
+ int level;
+ __be64 *pp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
@@ -1261,21 +1213,23 @@ xfs_bmap_read_extents(
error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
- return error;
+ goto out;
block = XFS_BUF_TO_BLOCK(bp);
if (level == 0)
break;
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
XFS_WANT_CORRUPTED_GOTO(mp,
- XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ XFS_FSB_SANITY_CHECK(mp, bno), out_brelse);
xfs_trans_brelse(tp, bp);
}
+
/*
* Here with bp and block set to the leftmost leaf node in the tree.
*/
- room = xfs_iext_count(ifp);
i = 0;
+ xfs_iext_first(ifp, &icur);
+
/*
* Loop over all leaf nodes. Copy information to the extent records.
*/
@@ -1285,14 +1239,15 @@ xfs_bmap_read_extents(
xfs_extnum_t num_recs;
num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(i + num_recs > room)) {
- ASSERT(i + num_recs <= room);
+ if (unlikely(i + num_recs > nextents)) {
+ ASSERT(i + num_recs <= nextents);
xfs_warn(ip->i_mount,
"corrupt dinode %Lu, (btree extents).",
(unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+ XFS_CORRUPTION_ERROR(__func__,
XFS_ERRLEVEL_LOW, ip->i_mount, block);
- goto error0;
+ error = -EFSCORRUPTED;
+ goto out_brelse;
}
/*
* Read-ahead the next leaf block, if any.
@@ -1305,15 +1260,17 @@ xfs_bmap_read_extents(
* Copy records into the extent records.
*/
frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- for (j = 0; j < num_recs; j++, i++, frp++) {
- xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
- trp->l0 = be64_to_cpu(frp->l0);
- trp->l1 = be64_to_cpu(frp->l1);
- if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) {
+ for (j = 0; j < num_recs; j++, frp++, i++) {
+ xfs_bmbt_disk_get_all(frp, &new);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) {
XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
XFS_ERRLEVEL_LOW, mp);
- goto error0;
+ error = -EFSCORRUPTED;
+ goto out_brelse;
}
+ xfs_iext_insert(ip, &icur, &new, state);
+ trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
+ xfs_iext_next(ifp, &icur);
}
xfs_trans_brelse(tp, bp);
bno = nextbno;
@@ -1325,71 +1282,74 @@ xfs_bmap_read_extents(
error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
- return error;
+ goto out;
block = XFS_BUF_TO_BLOCK(bp);
}
- if (i != XFS_IFORK_NEXTENTS(ip, whichfork))
- return -EFSCORRUPTED;
+
+ if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
ASSERT(i == xfs_iext_count(ifp));
- XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+
+ ifp->if_flags |= XFS_IFEXTENTS;
return 0;
-error0:
+
+out_brelse:
xfs_trans_brelse(tp, bp);
- return -EFSCORRUPTED;
+out:
+ xfs_iext_destroy(ifp);
+ return error;
}
/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
+ * Returns the relative block number of the first unused block(s) in the given
+ * fork with at least "len" logically contiguous blocks free. This is the
+ * lowest-address hole if the fork has holes, else the first block past the end
+ * of fork. Return 0 if the fork is currently local (in-inode).
*/
int /* error */
xfs_bmap_first_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *first_unused, /* unused block */
- int whichfork) /* data or attr fork */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
{
- int error; /* error return value */
- int idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_fileoff_t lastaddr; /* last block number seen */
- xfs_fileoff_t lowest; /* lowest useful block */
- xfs_fileoff_t max; /* starting useful block */
- xfs_extnum_t nextents; /* number of extent entries */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t lastaddr = 0;
+ xfs_fileoff_t lowest, max;
+ int error;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
*first_unused = 0;
return 0;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- lowest = *first_unused;
- nextents = xfs_iext_count(ifp);
- for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
- struct xfs_bmbt_irec got;
- xfs_iext_get_extent(ifp, idx, &got);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+ lowest = max = *first_unused;
+ for_each_xfs_iext(ifp, &icur, &got) {
/*
* See if the hole before this extent will work.
*/
if (got.br_startoff >= lowest + len &&
- got.br_startoff - max >= len) {
- *first_unused = max;
- return 0;
- }
+ got.br_startoff - max >= len)
+ break;
lastaddr = got.br_startoff + got.br_blockcount;
max = XFS_FILEOFF_MAX(lastaddr, lowest);
}
+
*first_unused = max;
return 0;
}
@@ -1409,7 +1369,7 @@ xfs_bmap_last_before(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int error;
switch (XFS_IFORK_FORMAT(ip, whichfork)) {
@@ -1429,17 +1389,8 @@ xfs_bmap_last_before(
return error;
}
- if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) {
- if (got.br_startoff <= *last_block - 1)
- return 0;
- }
-
- if (xfs_iext_get_extent(ifp, idx - 1, &got)) {
- *last_block = got.br_startoff + got.br_blockcount;
- return 0;
- }
-
- *last_block = 0;
+ if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got))
+ *last_block = 0;
return 0;
}
@@ -1452,8 +1403,8 @@ xfs_bmap_last_extent(
int *is_empty)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_iext_cursor icur;
int error;
- int nextents;
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
@@ -1461,14 +1412,11 @@ xfs_bmap_last_extent(
return error;
}
- nextents = xfs_iext_count(ifp);
- if (nextents == 0) {
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, rec))
*is_empty = 1;
- return 0;
- }
-
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
- *is_empty = 0;
+ else
+ *is_empty = 0;
return 0;
}
@@ -1490,14 +1438,14 @@ xfs_bmap_isaeof(
int is_empty;
int error;
- bma->aeof = 0;
+ bma->aeof = false;
error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
&is_empty);
if (error)
return error;
if (is_empty) {
- bma->aeof = 1;
+ bma->aeof = true;
return 0;
}
@@ -1553,10 +1501,10 @@ xfs_bmap_one_block(
xfs_inode_t *ip, /* incore inode */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
xfs_ifork_t *ifp; /* inode fork pointer */
int rval; /* return value */
xfs_bmbt_irec_t s; /* internal version of extent */
+ struct xfs_iext_cursor icur;
#ifndef DEBUG
if (whichfork == XFS_DATA_FORK)
@@ -1568,8 +1516,8 @@ xfs_bmap_one_block(
return 0;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_get_all(ep, &s);
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_get_extent(ifp, &icur, &s);
rval = s.br_startoff == 0 && s.br_blockcount == 1;
if (rval && whichfork == XFS_DATA_FORK)
ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
@@ -1589,8 +1537,6 @@ xfs_bmap_add_extent_delay_real(
int whichfork)
{
struct xfs_bmbt_irec *new = &bma->got;
- int diff; /* temp value */
- xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1598,14 +1544,14 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = 0;/* state bits, accessed thru macros */
+ int state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
- xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
struct xfs_mount *mp;
xfs_extnum_t *nextents;
+ struct xfs_bmbt_irec old;
mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
@@ -1613,8 +1559,6 @@ xfs_bmap_add_extent_delay_real(
nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
&bma->ip->i_d.di_nextents);
- ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -1625,15 +1569,12 @@ xfs_bmap_add_extent_delay_real(
#define RIGHT r[1]
#define PREV r[2]
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
/*
* Set up a bunch of variables to make the tests simpler.
*/
- ep = xfs_iext_get_ext(ifp, bma->idx);
- xfs_bmbt_get_all(ep, &PREV);
+ xfs_iext_get_extent(ifp, &bma->icur, &PREV);
new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(isnullstartblock(PREV.br_startblock));
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
@@ -1653,10 +1594,8 @@ xfs_bmap_add_extent_delay_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (bma->idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
-
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -1673,10 +1612,8 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < xfs_iext_count(ifp) - 1) {
+ if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
-
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1706,22 +1643,19 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left and right neighbors are both contiguous with new.
*/
- bma->idx--;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
- LEFT.br_blockcount + PREV.br_blockcount +
- RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
- xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
+
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
(*nextents)--;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -1733,11 +1667,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount +
- PREV.br_blockcount +
- RIGHT.br_blockcount, LEFT.br_state);
+ error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
}
@@ -1748,28 +1678,22 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left neighbor is contiguous, the right is not.
*/
- bma->idx--;
+ old = LEFT;
+ LEFT.br_blockcount += PREV.br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
- LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock, LEFT.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount +
- PREV.br_blockcount, LEFT.br_state);
+ error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
}
@@ -1780,27 +1704,23 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, new->br_startblock);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount + RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_blockcount += RIGHT.br_blockcount;
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
- new->br_startblock,
- PREV.br_blockcount +
- RIGHT.br_blockcount, PREV.br_state);
+ error = xfs_bmbt_update(bma->cur, &PREV);
if (error)
goto done;
}
@@ -1812,23 +1732,19 @@ xfs_bmap_add_extent_delay_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, new->br_startblock);
- xfs_bmbt_set_state(ep, new->br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
(*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -1841,40 +1757,33 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
- LEFT.br_blockcount + new->br_blockcount);
- xfs_bmbt_set_startoff(ep,
- PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
-
+ old = LEFT;
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+
+ LEFT.br_blockcount += new->br_blockcount;
+
+ PREV.br_blockcount = temp;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock, LEFT.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount +
- new->br_blockcount,
- LEFT.br_state);
+ error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
}
- da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
- startblockval(PREV.br_startblock));
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
- bma->idx--;
break;
case BMAP_LEFT_FILLING:
@@ -1882,23 +1791,16 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, new_endoff);
- temp = PREV.br_blockcount - new->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
(*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -1913,12 +1815,18 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+
+ temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
(bma->cur ? bma->cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, bma->idx + 1);
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+
+ PREV.br_startoff = new_endoff;
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_prev(ifp, &bma->icur);
break;
case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1926,40 +1834,34 @@ xfs_bmap_add_extent_delay_real(
* Filling in the last part of a previous delayed allocation.
* The right neighbor is contiguous with the new allocation.
*/
- temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
- new->br_startoff, new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount,
- RIGHT.br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+ old = RIGHT;
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
+
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount +
- RIGHT.br_blockcount,
- RIGHT.br_state);
+ error = xfs_bmbt_update(bma->cur, &RIGHT);
if (error)
goto done;
}
+ temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock));
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- bma->idx++;
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
break;
case BMAP_RIGHT_FILLING:
@@ -1967,22 +1869,16 @@ xfs_bmap_add_extent_delay_real(
* Filling in the last part of a previous delayed allocation.
* The right neighbor is not contiguous.
*/
- temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
(*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -1997,14 +1893,16 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+
+ temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
(bma->cur ? bma->cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, bma->idx);
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- bma->idx++;
+ PREV.br_startblock = nullstartblock(da_new);
+ PREV.br_blockcount = temp;
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_next(ifp, &bma->icur);
break;
case 0:
@@ -2028,30 +1926,40 @@ xfs_bmap_add_extent_delay_real(
* PREV @ idx LEFT RIGHT
* inserted at idx + 1
*/
- temp = new->br_startoff - PREV.br_startoff;
- temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
+ old = PREV;
+
+ /* LEFT is the new middle */
LEFT = *new;
+
+ /* RIGHT is the new right */
RIGHT.br_state = PREV.br_state;
- RIGHT.br_startblock = nullstartblock(
- (int)xfs_bmap_worst_indlen(bma->ip, temp2));
RIGHT.br_startoff = new_endoff;
- RIGHT.br_blockcount = temp2;
- /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
- xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+ RIGHT.br_blockcount =
+ PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ RIGHT.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ RIGHT.br_blockcount));
+
+ /* truncate PREV */
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
+ PREV.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ PREV.br_blockcount));
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
+ xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
(*nextents)++;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -2066,30 +1974,9 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
- temp = xfs_bmap_worst_indlen(bma->ip, temp);
- temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
- diff = (int)(temp + temp2 -
- (startblockval(PREV.br_startblock) -
- (bma->cur ?
- bma->cur->bc_private.b.allocated : 0)));
- if (diff > 0) {
- error = xfs_mod_fdblocks(bma->ip->i_mount,
- -((int64_t)diff), false);
- ASSERT(!error);
- if (error)
- goto done;
- }
- ep = xfs_iext_get_ext(ifp, bma->idx);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
- nullstartblock((int)temp2));
- trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
-
- bma->idx++;
- da_new = temp + temp2;
+ da_new = startblockval(PREV.br_startblock) +
+ startblockval(RIGHT.br_startblock);
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -2123,19 +2010,17 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
- /* adjust for changes in reserved delayed indirect blocks */
- if (da_old || da_new) {
- temp = da_new;
- if (bma->cur)
- temp += bma->cur->bc_private.b.allocated;
- if (temp < da_old)
- xfs_mod_fdblocks(bma->ip->i_mount,
- (int64_t)(da_old - temp), false);
+ if (bma->cur) {
+ da_new += bma->cur->bc_private.b.allocated;
+ bma->cur->bc_private.b.allocated = 0;
}
- /* clear out the allocated field, done with it now in any case. */
- if (bma->cur)
- bma->cur->bc_private.b.allocated = 0;
+ /* adjust for changes in reserved delayed indirect blocks */
+ if (da_new != da_old) {
+ ASSERT(state == 0 || da_new < da_old);
+ error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
+ false);
+ }
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
@@ -2155,7 +2040,7 @@ xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
- xfs_extnum_t *idx, /* extent number to update/insert */
+ struct xfs_iext_cursor *icur,
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_fsblock_t *first, /* pointer to firstblock variable */
@@ -2163,28 +2048,22 @@ xfs_bmap_add_extent_unwritten_real(
int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
- xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_fileoff_t new_endoff; /* end offset of new entry */
- xfs_exntst_t newext; /* new extent state */
- xfs_exntst_t oldext; /* old extent state */
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = 0;/* state bits, accessed thru macros */
+ int state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec old;
*logflagsp = 0;
cur = *curp;
ifp = XFS_IFORK_PTR(ip, whichfork);
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2197,12 +2076,8 @@ xfs_bmap_add_extent_unwritten_real(
* Set up a bunch of variables to make the tests simpler.
*/
error = 0;
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_get_all(ep, &PREV);
- newext = new->br_state;
- oldext = (newext == XFS_EXT_UNWRITTEN) ?
- XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
- ASSERT(PREV.br_state == oldext);
+ xfs_iext_get_extent(ifp, icur, &PREV);
+ ASSERT(new->br_state != PREV.br_state);
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
@@ -2220,10 +2095,8 @@ xfs_bmap_add_extent_unwritten_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (*idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
-
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2231,7 +2104,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
- LEFT.br_state == newext &&
+ LEFT.br_state == new->br_state &&
LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
state |= BMAP_LEFT_CONTIG;
@@ -2240,9 +2113,8 @@ xfs_bmap_add_extent_unwritten_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (*idx < xfs_iext_count(ifp) - 1) {
+ if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2250,7 +2122,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
- newext == RIGHT.br_state &&
+ new->br_state == RIGHT.br_state &&
new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
@@ -2271,24 +2143,20 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left and right neighbors are both contiguous with new.
*/
- --*idx;
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- LEFT.br_blockcount + PREV.br_blockcount +
- RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
- xfs_iext_remove(ip, *idx + 1, 2, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
@@ -2303,10 +2171,8 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount + PREV.br_blockcount +
- RIGHT.br_blockcount, LEFT.br_state)))
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
goto done;
}
break;
@@ -2316,23 +2182,19 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- --*idx;
-
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ LEFT.br_blockcount += PREV.br_blockcount;
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &PREV, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
@@ -2341,10 +2203,8 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount + PREV.br_blockcount,
- LEFT.br_state)))
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
goto done;
}
break;
@@ -2354,21 +2214,22 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount + RIGHT.br_blockcount);
- xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
+
+ xfs_iext_next(ifp, icur);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
@@ -2377,10 +2238,8 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount,
- newext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
}
break;
@@ -2391,22 +2250,19 @@ xfs_bmap_add_extent_unwritten_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(ip, state, icur, &PREV);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- newext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
}
break;
@@ -2416,43 +2272,32 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
- LEFT.br_blockcount + new->br_blockcount);
- xfs_bmbt_set_startoff(ep,
- PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
-
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep,
- new->br_startblock + new->br_blockcount);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
- --*idx;
+ LEFT.br_blockcount += new->br_blockcount;
+
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur,
- PREV.br_startoff + new->br_blockcount,
- PREV.br_startblock + new->br_blockcount,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
- if ((error = xfs_btree_decrement(cur, 0, &i)))
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
goto done;
- error = xfs_bmbt_update(cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount + new->br_blockcount,
- LEFT.br_state);
+ error = xfs_bmbt_update(cur, &LEFT);
if (error)
goto done;
}
@@ -2463,32 +2308,25 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
- xfs_bmbt_set_startoff(ep, new_endoff);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- xfs_bmbt_set_startblock(ep,
- new->br_startblock + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
- xfs_iext_insert(ip, *idx, 1, new, state);
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_insert(ip, icur, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur,
- PREV.br_startoff + new->br_blockcount,
- PREV.br_startblock + new->br_blockcount,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
cur->bc_rec.b = *new;
if ((error = xfs_btree_insert(cur, &i)))
@@ -2502,39 +2340,33 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is contiguous with the new allocation.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
- ++*idx;
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
- new->br_startoff, new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount, newext);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &RIGHT);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock,
- PREV.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
- PREV.br_startblock,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
- if ((error = xfs_btree_increment(cur, 0, &i)))
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
goto done;
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount,
- newext)))
+ error = xfs_bmbt_update(cur, &RIGHT);
+ if (error)
goto done;
}
break;
@@ -2544,13 +2376,12 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
- ++*idx;
- xfs_iext_insert(ip, *idx, 1, new, state);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
@@ -2558,22 +2389,17 @@ xfs_bmap_add_extent_unwritten_real(
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
- PREV.br_startblock,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2586,20 +2412,20 @@ xfs_bmap_add_extent_unwritten_real(
* newext. Contiguity is impossible here.
* One extent becomes three extents.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- new->br_startoff - PREV.br_startoff);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = PREV;
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
r[0] = *new;
r[1].br_startoff = new_endoff;
r[1].br_blockcount =
- PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ old.br_startoff + old.br_blockcount - new_endoff;
r[1].br_startblock = new->br_startblock + new->br_blockcount;
- r[1].br_state = oldext;
+ r[1].br_state = PREV.br_state;
- ++*idx;
- xfs_iext_insert(ip, *idx, 2, &r[0], state);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &r[1], state);
+ xfs_iext_insert(ip, icur, &r[0], state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
@@ -2607,20 +2433,16 @@ xfs_bmap_add_extent_unwritten_real(
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
/* new right extent - oldext */
- if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
- r[1].br_startblock, r[1].br_blockcount,
- r[1].br_state)))
+ error = xfs_bmbt_update(cur, &r[1]);
+ if (error)
goto done;
/* new left extent - oldext */
cur->bc_rec.b = PREV;
- cur->bc_rec.b.br_blockcount =
- new->br_startoff - PREV.br_startoff;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2629,13 +2451,11 @@ xfs_bmap_add_extent_unwritten_real(
* we are about to insert as we can't trust it after
* the previous insert.
*/
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
/* new middle extent - newext */
- cur->bc_rec.b.br_state = new->br_state;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2694,7 +2514,7 @@ STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
- xfs_extnum_t *idx, /* extent number to update/insert */
+ struct xfs_iext_cursor *icur,
xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -2702,22 +2522,17 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state; /* state bits, accessed thru macros */
- xfs_filblks_t temp=0; /* temp for indirect calculations */
+ int state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, whichfork);
- state = 0;
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
- if (*idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
-
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2726,10 +2541,8 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (*idx < xfs_iext_count(ifp)) {
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
-
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2761,22 +2574,20 @@ xfs_bmap_add_extent_hole_delay(
* on the left and on the right.
* Merge all three into a single extent record.
*/
- --*idx;
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
- nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_LEFT_CONTIG:
@@ -2785,18 +2596,17 @@ xfs_bmap_add_extent_hole_delay(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --*idx;
temp = left.br_blockcount + new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
- nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_RIGHT_CONTIG:
@@ -2805,16 +2615,15 @@ xfs_bmap_add_extent_hole_delay(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
- new->br_startoff,
- nullstartblock((int)newlen), temp, right.br_state);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
break;
case 0:
@@ -2824,7 +2633,7 @@ xfs_bmap_add_extent_hole_delay(
* Insert a new entry.
*/
oldlen = newlen = 0;
- xfs_iext_insert(ip, *idx, 1, new, state);
+ xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
@@ -2845,7 +2654,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork,
- xfs_extnum_t *idx,
+ struct xfs_iext_cursor *icur,
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
xfs_fsblock_t *first,
@@ -2860,27 +2669,19 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state; /* state bits, accessed thru macros */
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
- state = 0;
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (*idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2889,9 +2690,8 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (*idx < xfs_iext_count(ifp)) {
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2928,14 +2728,11 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- left.br_blockcount + new->br_blockcount +
- right.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ left.br_blockcount += new->br_blockcount + right.br_blockcount;
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -2943,9 +2740,7 @@ xfs_bmap_add_extent_hole_real(
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
- right.br_startblock, right.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &right, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2957,12 +2752,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(cur, left.br_startoff,
- left.br_startblock,
- left.br_blockcount +
- new->br_blockcount +
- right.br_blockcount,
- left.br_state);
+ error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
}
@@ -2974,27 +2764,21 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = left;
+ left.br_blockcount += new->br_blockcount;
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
- left.br_startblock, left.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(cur, left.br_startoff,
- left.br_startblock,
- left.br_blockcount +
- new->br_blockcount,
- left.br_state);
+ error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
}
@@ -3006,29 +2790,22 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
- new->br_startoff, new->br_startblock,
- new->br_blockcount + right.br_blockcount,
- right.br_state);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = right;
+
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = new->br_startblock;
+ right.br_blockcount += new->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &right);
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(cur,
- right.br_startoff,
- right.br_startblock,
- right.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount +
- right.br_blockcount,
- right.br_state);
+ error = xfs_bmbt_update(cur, &right);
if (error)
goto done;
}
@@ -3040,21 +2817,17 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(ip, *idx, 1, new, state);
+ xfs_iext_insert(ip, icur, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(cur,
- new->br_startoff,
- new->br_startblock,
- new->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- cur->bc_rec.b.br_state = new->br_state;
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
@@ -3865,6 +3638,17 @@ xfs_trim_extent(
}
}
+/* trim extent to within eof */
+void
+xfs_trim_extent_eof(
+ struct xfs_bmbt_irec *irec,
+ struct xfs_inode *ip)
+
+{
+ xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
+ i_size_read(VFS_I(ip))));
+}
+
/*
* Trim the returned map to the required bounds
*/
@@ -3983,7 +3767,7 @@ xfs_bmapi_read(
struct xfs_bmbt_irec got;
xfs_fileoff_t obno;
xfs_fileoff_t end;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int error;
bool eof = false;
int n = 0;
@@ -4025,7 +3809,7 @@ xfs_bmapi_read(
return error;
}
- if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got))
eof = true;
end = bno + len;
obno = bno;
@@ -4057,7 +3841,7 @@ xfs_bmapi_read(
break;
/* Else go on to the next record. */
- if (!xfs_iext_get_extent(ifp, ++idx, &got))
+ if (!xfs_iext_next_extent(ifp, &icur, &got))
eof = true;
}
*nmap = n;
@@ -4085,7 +3869,7 @@ xfs_bmapi_reserve_delalloc(
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
- xfs_extnum_t *lastx,
+ struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
@@ -4115,7 +3899,7 @@ xfs_bmapi_reserve_delalloc(
if (extsz) {
struct xfs_bmbt_irec prev;
- if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev))
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof,
@@ -4164,7 +3948,7 @@ xfs_bmapi_reserve_delalloc(
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
- xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
/*
* Tag the inode if blocks were preallocated. Note that COW fork
@@ -4209,10 +3993,7 @@ xfs_bmapi_allocate(
if (bma->wasdel) {
bma->length = (xfs_extlen_t)bma->got.br_blockcount;
bma->offset = bma->got.br_startoff;
- if (bma->idx) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
- &bma->prev);
- }
+ xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev);
} else {
bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
if (!bma->eof)
@@ -4297,7 +4078,7 @@ xfs_bmapi_allocate(
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
- whichfork, &bma->idx, &bma->cur, &bma->got,
+ whichfork, &bma->icur, &bma->cur, &bma->got,
bma->firstblock, bma->dfops, &bma->logflags);
bma->logflags |= tmp_logflags;
@@ -4309,7 +4090,7 @@ xfs_bmapi_allocate(
* or xfs_bmap_add_extent_hole_real might have merged it into one of
* the neighbouring ones.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
ASSERT(bma->got.br_startoff <= bma->offset);
ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
@@ -4367,8 +4148,8 @@ xfs_bmapi_convert_unwritten(
}
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
- &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
- &tmp_logflags);
+ &bma->icur, &bma->cur, mval, bma->firstblock,
+ bma->dfops, &tmp_logflags);
/*
* Log the inode core unconditionally in the unwritten extent conversion
* path because the conversion might not have done so (e.g., if the
@@ -4390,7 +4171,7 @@ xfs_bmapi_convert_unwritten(
* xfs_bmap_add_extent_unwritten_real might have merged it into one
* of the neighbouring ones.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
/*
* We may have combined previously unwritten space with written space,
@@ -4509,9 +4290,9 @@ xfs_bmapi_write(
end = bno + len;
obno = bno;
- if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got))
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
- if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev))
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
bma.tp = tp;
bma.ip = ip;
@@ -4553,7 +4334,8 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if (need_alloc || wasdelay) {
+ if ((need_alloc || wasdelay) &&
+ !(flags & XFS_BMAPI_CONVERT_ONLY)) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4616,7 +4398,7 @@ xfs_bmapi_write(
/* Else go on to the next record. */
bma.prev = bma.got;
- if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got))
+ if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
eof = true;
}
*nmap = n;
@@ -4689,7 +4471,7 @@ xfs_bmapi_remap(
struct xfs_btree_cur *cur = NULL;
xfs_fsblock_t firstblock = NULLFSBLOCK;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int logflags = 0, error;
ASSERT(len > 0);
@@ -4713,7 +4495,7 @@ xfs_bmapi_remap(
return error;
}
- if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) {
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
/* make sure we only reflink into a hole. */
ASSERT(got.br_startoff > bno);
ASSERT(got.br_startoff - bno >= len);
@@ -4734,8 +4516,8 @@ xfs_bmapi_remap(
got.br_blockcount = len;
got.br_state = XFS_EXT_NORM;
- error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur,
- &got, &firstblock, dfops, &logflags);
+ error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur,
+ &cur, &got, &firstblock, dfops, &logflags);
if (error)
goto error0;
@@ -4851,7 +4633,7 @@ int
xfs_bmap_del_extent_delay(
struct xfs_inode *ip,
int whichfork,
- xfs_extnum_t *idx,
+ struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del)
{
@@ -4861,7 +4643,8 @@ xfs_bmap_del_extent_delay(
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int error = 0, state = 0;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ int error = 0;
bool isrt;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4872,8 +4655,6 @@ xfs_bmap_del_extent_delay(
da_old = startblockval(got->br_startblock);
da_new = 0;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
@@ -4897,46 +4678,39 @@ xfs_bmap_del_extent_delay(
return error;
ip->i_delayed_blks -= del->br_blockcount;
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
if (got->br_startoff == del->br_startoff)
- state |= BMAP_LEFT_CONTIG;
+ state |= BMAP_LEFT_FILLING;
if (got_endoff == del_endoff)
- state |= BMAP_RIGHT_CONTIG;
+ state |= BMAP_RIGHT_FILLING;
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_iext_remove(ip, *idx, 1, state);
- --*idx;
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
break;
- case BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_FILLING:
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_startoff = del_endoff;
got->br_blockcount -= del->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
got->br_blockcount), da_old);
got->br_startblock = nullstartblock((int)da_new);
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
- case BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING:
/*
* Deleting the last part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount = got->br_blockcount - del->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
got->br_blockcount), da_old);
got->br_startblock = nullstartblock((int)da_new);
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
case 0:
/*
@@ -4948,8 +4722,6 @@ xfs_bmap_del_extent_delay(
* Warn if either of the new indlen reservations is zero as this
* can lead to delalloc problems.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-
got->br_blockcount = del->br_startoff - got->br_startoff;
got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount);
@@ -4961,15 +4733,14 @@ xfs_bmap_del_extent_delay(
del->br_blockcount);
got->br_startblock = nullstartblock((int)got_indlen);
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_);
new.br_startoff = del_endoff;
new.br_state = got->br_state;
new.br_startblock = nullstartblock((int)new_indlen);
- ++*idx;
- xfs_iext_insert(ip, *idx, 1, &new, state);
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
da_new = got_indlen + new_indlen - stolen;
del->br_blockcount -= stolen;
@@ -4988,7 +4759,7 @@ xfs_bmap_del_extent_delay(
void
xfs_bmap_del_extent_cow(
struct xfs_inode *ip,
- xfs_extnum_t *idx,
+ struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del)
{
@@ -5003,75 +4774,67 @@ xfs_bmap_del_extent_cow(
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got->br_startoff + got->br_blockcount;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
ASSERT(!isnullstartblock(got->br_startblock));
if (got->br_startoff == del->br_startoff)
- state |= BMAP_LEFT_CONTIG;
+ state |= BMAP_LEFT_FILLING;
if (got_endoff == del_endoff)
- state |= BMAP_RIGHT_CONTIG;
+ state |= BMAP_RIGHT_FILLING;
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_iext_remove(ip, *idx, 1, state);
- --*idx;
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
break;
- case BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_FILLING:
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_startoff = del_endoff;
got->br_blockcount -= del->br_blockcount;
got->br_startblock = del->br_startblock + del->br_blockcount;
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
- case BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING:
/*
* Deleting the last part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount -= del->br_blockcount;
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
case 0:
/*
* Deleting the middle of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount = del->br_startoff - got->br_startoff;
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
new.br_startoff = del_endoff;
new.br_blockcount = got_endoff - del_endoff;
new.br_state = got->br_state;
new.br_startblock = del->br_startblock + del->br_blockcount;
- ++*idx;
- xfs_iext_insert(ip, *idx, 1, &new, state);
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
break;
}
}
/*
* Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
+ * after removing space.
*/
STATIC int /* error */
-xfs_bmap_del_extent(
+xfs_bmap_del_extent_real(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t *idx, /* extent number to update/delete */
+ struct xfs_iext_cursor *icur,
struct xfs_defer_ops *dfops, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
@@ -5079,16 +4842,12 @@ xfs_bmap_del_extent(
int whichfork, /* data or attr fork */
int bflags) /* bmapi flags */
{
- xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
- xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
- int delay; /* current block is delayed allocated */
int do_fx; /* free extent at end of routine */
- xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
int error; /* error return value */
- int flags; /* inode logging flags */
- xfs_bmbt_irec_t got; /* current extent entry */
+ int flags = 0;/* inode logging flags */
+ struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -5097,103 +4856,81 @@ xfs_bmap_del_extent(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- xfs_filblks_t temp; /* for indirect length calculations */
- xfs_filblks_t temp2; /* for indirect length calculations */
- int state = 0;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
- else if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp)));
ASSERT(del->br_blockcount > 0);
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_get_all(ep, &got);
+ xfs_iext_get_extent(ifp, icur, &got);
ASSERT(got.br_startoff <= del->br_startoff);
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got.br_startoff + got.br_blockcount;
ASSERT(got_endoff >= del_endoff);
- delay = isnullstartblock(got.br_startblock);
- ASSERT(isnullstartblock(del->br_startblock) == delay);
- flags = 0;
+ ASSERT(!isnullstartblock(got.br_startblock));
qfield = 0;
error = 0;
+
/*
- * If deleting a real allocation, must free up the disk space.
+ * If it's the case where the directory code is running with no block
+ * reservation, and the deleted block is in the middle of its extent,
+ * and the resulting insert of an extent would cause transformation to
+ * btree format, then reject it. The calling code will then swap blocks
+ * around instead. We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction will be dirty then.
*/
- if (!delay) {
- flags = XFS_ILOG_CORE;
- /*
- * Realtime allocation. Free it and record di_nblocks update.
- */
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
- xfs_filblks_t len;
-
- ASSERT(do_mod(del->br_blockcount,
- mp->m_sb.sb_rextsize) == 0);
- ASSERT(do_mod(del->br_startblock,
- mp->m_sb.sb_rextsize) == 0);
- bno = del->br_startblock;
- len = del->br_blockcount;
- do_div(bno, mp->m_sb.sb_rextsize);
- do_div(len, mp->m_sb.sb_rextsize);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
- do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
- qfield = XFS_TRANS_DQ_RTBCOUNT;
- }
- /*
- * Ordinary allocation.
- */
- else {
- do_fx = 1;
- nblks = del->br_blockcount;
- qfield = XFS_TRANS_DQ_BCOUNT;
- }
- /*
- * Set up del_endblock and cur for later.
- */
- del_endblock = del->br_startblock + del->br_blockcount;
- if (cur) {
- if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock, got.br_blockcount,
- &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- }
- da_old = da_new = 0;
- } else {
- da_old = startblockval(got.br_startblock);
- da_new = 0;
- nblks = 0;
+ if (tp->t_blk_res == 0 &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >=
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
+ del->br_startoff > got.br_startoff && del_endoff < got_endoff)
+ return -ENOSPC;
+
+ flags = XFS_ILOG_CORE;
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_fsblock_t bno;
+ xfs_filblks_t len;
+
+ ASSERT(do_mod(del->br_blockcount, mp->m_sb.sb_rextsize) == 0);
+ ASSERT(do_mod(del->br_startblock, mp->m_sb.sb_rextsize) == 0);
+ bno = del->br_startblock;
+ len = del->br_blockcount;
+ do_div(bno, mp->m_sb.sb_rextsize);
+ do_div(len, mp->m_sb.sb_rextsize);
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ } else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
}
- /*
- * Set flag value to use in switch statement.
- * Left-contig is 2, right-contig is 1.
- */
- switch (((got.br_startoff == del->br_startoff) << 1) |
- (got_endoff == del_endoff)) {
- case 3:
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ }
+
+ if (got.br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Matches the whole extent. Delete the entry.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx, 1,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
- --*idx;
- if (delay)
- break;
-
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
flags |= XFS_ILOG_CORE;
@@ -5205,168 +4942,106 @@ xfs_bmap_del_extent(
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
break;
-
- case 2:
+ case BMAP_LEFT_FILLING:
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, del_endoff);
- temp = got.br_blockcount - del->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ got.br_startoff = del_endoff;
+ got.br_startblock = del_endblock;
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
}
- if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
goto done;
break;
-
- case 1:
+ case BMAP_RIGHT_FILLING:
/*
* Deleting the last part of the extent.
*/
- temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
}
- if ((error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
goto done;
break;
-
case 0:
/*
* Deleting the middle of the extent.
*/
- temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
+ old = got;
+
+ got.br_blockcount = del->br_startoff - got.br_startoff;
+ xfs_iext_update_extent(ip, state, icur, &got);
+
new.br_startoff = del_endoff;
- temp2 = got_endoff - del_endoff;
- new.br_blockcount = temp2;
+ new.br_blockcount = got_endoff - del_endoff;
new.br_state = got.br_state;
- if (!delay) {
- new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
- if (cur) {
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock, temp,
- got.br_state)))
- goto done;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto done;
- cur->bc_rec.b = new;
- error = xfs_btree_insert(cur, &i);
- if (error && error != -ENOSPC)
- goto done;
+ new.br_startblock = del_endblock;
+
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != -ENOSPC)
+ goto done;
+ /*
+ * If get no-space back from btree insert, it tried a
+ * split, and we have a zero block reservation. Fix up
+ * our state and return the error.
+ */
+ if (error == -ENOSPC) {
/*
- * If get no-space back from btree insert,
- * it tried a split, and we have a zero
- * block reservation.
- * Fix up our state and return the error.
+ * Reset the cursor, don't trust it after any
+ * insert operation.
*/
- if (error == -ENOSPC) {
- /*
- * Reset the cursor, don't trust
- * it after any insert operation.
- */
- if ((error = xfs_bmbt_lookup_eq(cur,
- got.br_startoff,
- got.br_startblock,
- temp, &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(mp,
- i == 1, done);
- /*
- * Update the btree record back
- * to the original value.
- */
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state)))
- goto done;
- /*
- * Reset the extent record back
- * to the original value.
- */
- xfs_bmbt_set_blockcount(ep,
- got.br_blockcount);
- flags = 0;
- error = -ENOSPC;
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
goto done;
- }
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- } else
- flags |= xfs_ilog_fext(whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
- } else {
- xfs_filblks_t stolen;
- ASSERT(whichfork == XFS_DATA_FORK);
-
- /*
- * Distribute the original indlen reservation across the
- * two new extents. Steal blocks from the deleted extent
- * if necessary. Stealing blocks simply fudges the
- * fdblocks accounting in xfs_bunmapi().
- */
- temp = xfs_bmap_worst_indlen(ip, got.br_blockcount);
- temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount);
- stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2,
- del->br_blockcount);
- da_new = temp + temp2 - stolen;
- del->br_blockcount -= stolen;
-
- /*
- * Set the reservation for each extent. Warn if either
- * is zero as this can lead to delalloc problems.
- */
- WARN_ON_ONCE(!temp || !temp2);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- new.br_startblock = nullstartblock((int)temp2);
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_insert(ip, *idx + 1, 1, &new, state);
- ++*idx;
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ error = xfs_bmbt_update(cur, &old);
+ if (error)
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_iext_update_extent(ip, state, icur, &old);
+ flags = 0;
+ error = -ENOSPC;
+ goto done;
+ }
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
break;
}
/* remove reverse mapping */
- if (!delay) {
- error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
- if (error)
- goto done;
- }
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
+ if (error)
+ goto done;
/*
* If we need to, add to list of extents to delete.
@@ -5392,13 +5067,6 @@ xfs_bmap_del_extent(
if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
- /*
- * Account for change in delayed indirect blocks.
- * Nothing to do for disk quota accounting here.
- */
- ASSERT(da_old >= da_new);
- if (da_old > da_new)
- xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
done:
*logflagsp = flags;
return error;
@@ -5414,7 +5082,7 @@ int /* error */
__xfs_bunmapi(
xfs_trans_t *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting offset to unmap */
+ xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
int flags, /* misc flags */
xfs_extnum_t nexts, /* number of extents max */
@@ -5429,11 +5097,9 @@ __xfs_bunmapi(
xfs_bmbt_irec_t got; /* current extent record */
xfs_ifork_t *ifp; /* inode fork pointer */
int isrt; /* freeing in rt area */
- xfs_extnum_t lastx; /* last extent index used */
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
xfs_mount_t *mp; /* mount structure */
- xfs_fileoff_t start; /* first file offset deleted */
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
@@ -5441,8 +5107,11 @@ __xfs_bunmapi(
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t max_len;
xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
+ xfs_fileoff_t end;
+ struct xfs_iext_cursor icur;
+ bool done = false;
- trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+ trace_xfs_bunmap(ip, start, len, flags, _RET_IP_);
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
@@ -5481,18 +5150,13 @@ __xfs_bunmapi(
}
XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
- start = bno;
- bno = start + len - 1;
+ end = start + len;
- /*
- * Check to see if the given block number is past the end of the
- * file, back up to the last block if so...
- */
- if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) {
- ASSERT(lastx > 0);
- xfs_iext_get_extent(ifp, --lastx, &got);
- bno = got.br_startoff + got.br_blockcount - 1;
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
+ *rlen = 0;
+ return 0;
}
+ end--;
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
@@ -5515,24 +5179,24 @@ __xfs_bunmapi(
}
extno = 0;
- while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+ while (end != (xfs_fileoff_t)-1 && end >= start &&
(nexts == 0 || extno < nexts) && max_len > 0) {
/*
- * Is the found extent after a hole in which bno lives?
+ * Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
*/
- if (got.br_startoff > bno) {
- if (--lastx < 0)
- break;
- xfs_iext_get_extent(ifp, lastx, &got);
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
}
/*
* Is the last block of this extent before the range
* we're supposed to delete? If so, we're done.
*/
- bno = XFS_FILEOFF_MIN(bno,
+ end = XFS_FILEOFF_MIN(end,
got.br_startoff + got.br_blockcount - 1);
- if (bno < start)
+ if (end < start)
break;
/*
* Then deal with the (possibly delayed) allocated space
@@ -5557,8 +5221,8 @@ __xfs_bunmapi(
if (!wasdel)
del.br_startblock += start - got.br_startoff;
}
- if (del.br_startoff + del.br_blockcount > bno + 1)
- del.br_blockcount = bno + 1 - del.br_startoff;
+ if (del.br_startoff + del.br_blockcount > end + 1)
+ del.br_blockcount = end + 1 - del.br_startoff;
/* How much can we safely unmap? */
if (max_len < del.br_blockcount) {
@@ -5584,13 +5248,13 @@ __xfs_bunmapi(
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
*/
- ASSERT(bno >= mod);
- bno -= mod > del.br_blockcount ?
+ ASSERT(end >= mod);
+ end -= mod > del.br_blockcount ?
del.br_blockcount : mod;
- if (bno < got.br_startoff) {
- if (--lastx >= 0)
- xfs_bmbt_get_all(xfs_iext_get_ext(
- ifp, lastx), &got);
+ if (end < got.br_startoff &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
}
continue;
}
@@ -5611,7 +5275,7 @@ __xfs_bunmapi(
}
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp, ip,
- whichfork, &lastx, &cur, &del,
+ whichfork, &icur, &cur, &del,
firstblock, dfops, &logflags);
if (error)
goto error0;
@@ -5636,10 +5300,13 @@ __xfs_bunmapi(
* Can't make it unwritten. There isn't
* a full extent here so just skip it.
*/
- ASSERT(bno >= del.br_blockcount);
- bno -= del.br_blockcount;
- if (got.br_startoff > bno && --lastx >= 0)
- xfs_iext_get_extent(ifp, lastx, &got);
+ ASSERT(end >= del.br_blockcount);
+ end -= del.br_blockcount;
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
struct xfs_bmbt_irec prev;
@@ -5650,8 +5317,8 @@ __xfs_bunmapi(
* Unwrite the killed part of that one and
* try again.
*/
- ASSERT(lastx > 0);
- xfs_iext_get_extent(ifp, lastx - 1, &prev);
+ if (!xfs_iext_prev_extent(ifp, &icur, &prev))
+ ASSERT(0);
ASSERT(prev.br_state == XFS_EXT_NORM);
ASSERT(!isnullstartblock(prev.br_startblock));
ASSERT(del.br_startblock ==
@@ -5663,9 +5330,8 @@ __xfs_bunmapi(
prev.br_startoff = start;
}
prev.br_state = XFS_EXT_UNWRITTEN;
- lastx--;
error = xfs_bmap_add_extent_unwritten_real(tp,
- ip, whichfork, &lastx, &cur,
+ ip, whichfork, &icur, &cur,
&prev, firstblock, dfops,
&logflags);
if (error)
@@ -5675,7 +5341,7 @@ __xfs_bunmapi(
ASSERT(del.br_state == XFS_EXT_NORM);
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp,
- ip, whichfork, &lastx, &cur,
+ ip, whichfork, &icur, &cur,
&del, firstblock, dfops,
&logflags);
if (error)
@@ -5684,85 +5350,39 @@ __xfs_bunmapi(
}
}
- /*
- * If it's the case where the directory code is running
- * with no block reservation, and the deleted block is in
- * the middle of its extent, and the resulting insert
- * of an extent would cause transformation to btree format,
- * then reject it. The calling code will then swap
- * blocks around instead.
- * We have to do this now, rather than waiting for the
- * conversion to btree format, since the transaction
- * will be dirty.
- */
- if (!wasdel && tp->t_blk_res == 0 &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
- XFS_IFORK_MAXEXT(ip, whichfork) &&
- del.br_startoff > got.br_startoff &&
- del.br_startoff + del.br_blockcount <
- got.br_startoff + got.br_blockcount) {
- error = -ENOSPC;
- goto error0;
+ if (wasdel) {
+ error = xfs_bmap_del_extent_delay(ip, whichfork, &icur,
+ &got, &del);
+ } else {
+ error = xfs_bmap_del_extent_real(ip, tp, &icur, dfops,
+ cur, &del, &tmp_logflags, whichfork,
+ flags);
+ logflags |= tmp_logflags;
}
- /*
- * Unreserve quota and update realtime free space, if
- * appropriate. If delayed allocation, update the inode delalloc
- * counter now and wait to update the sb counters as
- * xfs_bmap_del_extent() might need to borrow some blocks.
- */
- if (wasdel) {
- ASSERT(startblockval(del.br_startblock) > 0);
- if (isrt) {
- xfs_filblks_t rtexts;
-
- rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
- do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_frextents(mp, (int64_t)rtexts);
- (void)xfs_trans_reserve_quota_nblks(NULL,
- ip, -((long)del.br_blockcount), 0,
- XFS_QMOPT_RES_RTBLKS);
- } else {
- (void)xfs_trans_reserve_quota_nblks(NULL,
- ip, -((long)del.br_blockcount), 0,
- XFS_QMOPT_RES_REGBLKS);
- }
- ip->i_delayed_blks -= del.br_blockcount;
- if (cur)
- cur->bc_private.b.flags |=
- XFS_BTCUR_BPRV_WASDEL;
- } else if (cur)
- cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
-
- error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
- &tmp_logflags, whichfork, flags);
- logflags |= tmp_logflags;
if (error)
goto error0;
- if (!isrt && wasdel)
- xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
-
max_len -= del.br_blockcount;
- bno = del.br_startoff - 1;
+ end = del.br_startoff - 1;
nodelete:
/*
* If not done go on to the next (previous) record.
*/
- if (bno != (xfs_fileoff_t)-1 && bno >= start) {
- if (lastx >= 0) {
- xfs_iext_get_extent(ifp, lastx, &got);
- if (got.br_startoff > bno && --lastx >= 0)
- xfs_iext_get_extent(ifp, lastx, &got);
+ if (end != (xfs_fileoff_t)-1 && end >= start) {
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got))) {
+ done = true;
+ break;
}
extno++;
}
}
- if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0)
+ if (done || end == (xfs_fileoff_t)-1 || end < start)
*rlen = 0;
else
- *rlen = bno - start + 1;
+ *rlen = end - start + 1;
/*
* Convert to a btree if necessary.
@@ -5880,14 +5500,13 @@ xfs_bmse_merge(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t shift, /* shift fsb */
- int current_ext, /* idx of gotp */
+ struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got, /* extent to shift */
struct xfs_bmbt_irec *left, /* preceding extent */
struct xfs_btree_cur *cur,
int *logflags, /* output */
struct xfs_defer_ops *dfops)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
@@ -5915,8 +5534,7 @@ xfs_bmse_merge(
}
/* lookup and remove the extent to merge */
- error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock,
- got->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
@@ -5927,20 +5545,20 @@ xfs_bmse_merge(
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
/* lookup and update size of the previous extent */
- error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock,
- left->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
- error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock,
- new.br_blockcount, new.br_state);
+ error = xfs_bmbt_update(cur, &new);
if (error)
return error;
done:
- xfs_iext_update_extent(ifp, current_ext - 1, &new);
- xfs_iext_remove(ip, current_ext, 1, 0);
+ xfs_iext_remove(ip, icur, 0);
+ xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ &new);
/* update reverse mapping. rmap functions merge the rmaps for us */
error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got);
@@ -5951,183 +5569,83 @@ done:
return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new);
}
-/*
- * Shift a single extent.
- */
-STATIC int
-xfs_bmse_shift_one(
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t offset_shift_fsb,
- int *current_ext,
- struct xfs_bmbt_irec *got,
- struct xfs_btree_cur *cur,
- int *logflags,
- enum shift_direction direction,
- struct xfs_defer_ops *dfops)
+static int
+xfs_bmap_shift_update_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_btree_cur *cur,
+ int *logflags,
+ struct xfs_defer_ops *dfops,
+ xfs_fileoff_t startoff)
{
- struct xfs_ifork *ifp;
- struct xfs_mount *mp;
- xfs_fileoff_t startoff;
- struct xfs_bmbt_irec adj_irec, new;
- int error;
- int i;
- int total_extents;
-
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- total_extents = xfs_iext_count(ifp);
-
- /* delalloc extents should be prevented by caller */
- XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock));
-
- if (direction == SHIFT_LEFT) {
- startoff = got->br_startoff - offset_shift_fsb;
-
- /*
- * Check for merge if we've got an extent to the left,
- * otherwise make sure there's enough room at the start
- * of the file for the shift.
- */
- if (!*current_ext) {
- if (got->br_startoff < offset_shift_fsb)
- return -EINVAL;
- goto update_current_ext;
- }
-
- /*
- * grab the left extent and check for a large enough hole.
- */
- xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec);
- if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount)
- return -EINVAL;
-
- /* check whether to merge the extent or shift it down */
- if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) {
- return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
- *current_ext, got, &adj_irec,
- cur, logflags, dfops);
- }
- } else {
- startoff = got->br_startoff + offset_shift_fsb;
- /* nothing to move if this is the last extent */
- if (*current_ext >= (total_extents - 1))
- goto update_current_ext;
-
- /*
- * If this is not the last extent in the file, make sure there
- * is enough room between current extent and next extent for
- * accommodating the shift.
- */
- xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec);
- if (startoff + got->br_blockcount > adj_irec.br_startoff)
- return -EINVAL;
-
- /*
- * Unlike a left shift (which involves a hole punch),
- * a right shift does not modify extent neighbors
- * in any way. We should never find mergeable extents
- * in this scenario. Check anyways and warn if we
- * encounter two extents that could be one.
- */
- if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb))
- WARN_ON_ONCE(1);
- }
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec prev = *got;
+ int error, i;
- /*
- * Increment the extent index for the next iteration, update the start
- * offset of the in-core extent and update the btree if applicable.
- */
-update_current_ext:
*logflags |= XFS_ILOG_CORE;
- new = *got;
- new.br_startoff = startoff;
+ got->br_startoff = startoff;
if (cur) {
- error = xfs_bmbt_lookup_eq(cur, got->br_startoff,
- got->br_startblock, got->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
- error = xfs_bmbt_update(cur, new.br_startoff,
- new.br_startblock, new.br_blockcount,
- new.br_state);
+ error = xfs_bmbt_update(cur, got);
if (error)
return error;
} else {
*logflags |= XFS_ILOG_DEXT;
}
- xfs_iext_update_extent(ifp, *current_ext, &new);
-
- if (direction == SHIFT_LEFT)
- (*current_ext)++;
- else
- (*current_ext)--;
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ got);
/* update reverse mapping */
- error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got);
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &prev);
if (error)
return error;
- return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new);
+ return xfs_rmap_map_extent(mp, dfops, ip, whichfork, got);
}
-/*
- * Shift extent records to the left/right to cover/create a hole.
- *
- * The maximum number of extents to be shifted in a single operation is
- * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the
- * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
- * is the length by which each extent is shifted. If there is no hole to shift
- * the extents into, this will be considered invalid operation and we abort
- * immediately.
- */
int
-xfs_bmap_shift_extents(
+xfs_bmap_collapse_extents(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *next_fsb,
xfs_fileoff_t offset_shift_fsb,
- int *done,
+ bool *done,
xfs_fileoff_t stop_fsb,
xfs_fsblock_t *firstblock,
- struct xfs_defer_ops *dfops,
- enum shift_direction direction,
- int num_exts)
+ struct xfs_defer_ops *dfops)
{
- struct xfs_btree_cur *cur = NULL;
- struct xfs_bmbt_irec got;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
- xfs_extnum_t nexts = 0;
- xfs_extnum_t current_ext;
- xfs_extnum_t total_extents;
- xfs_extnum_t stop_extent;
- int error = 0;
- int whichfork = XFS_DATA_FORK;
- int logflags = 0;
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, prev;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmap_shift_extents",
- XFS_ERRLEVEL_LOW, mp);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
- ifp = XFS_IFORK_PTR(ip, whichfork);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- /* Read in all the extents */
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
@@ -6140,107 +5658,165 @@ xfs_bmap_shift_extents(
cur->bc_private.b.flags = 0;
}
- /*
- * There may be delalloc extents in the data fork before the range we
- * are collapsing out, so we cannot use the count of real extents here.
- * Instead we have to calculate it from the incore fork.
- */
- total_extents = xfs_iext_count(ifp);
- if (total_extents == 0) {
- *done = 1;
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
goto del_cursor;
}
+ XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
- /*
- * In case of first right shift, we need to initialize next_fsb
- */
- if (*next_fsb == NULLFSBLOCK) {
- ASSERT(direction == SHIFT_RIGHT);
-
- current_ext = total_extents - 1;
- xfs_iext_get_extent(ifp, current_ext, &got);
- if (stop_fsb > got.br_startoff) {
- *done = 1;
+ new_startoff = got.br_startoff - offset_shift_fsb;
+ if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
+ if (new_startoff < prev.br_startoff + prev.br_blockcount) {
+ error = -EINVAL;
goto del_cursor;
}
- *next_fsb = got.br_startoff;
+
+ if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) {
+ error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+ &icur, &got, &prev, cur, &logflags,
+ dfops);
+ if (error)
+ goto del_cursor;
+ goto done;
+ }
} else {
- /*
- * Look up the extent index for the fsb where we start shifting. We can
- * henceforth iterate with current_ext as extent list changes are locked
- * out via ilock.
- *
- * If next_fsb lies in a hole beyond which there are no extents we are
- * done.
- */
- if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &current_ext,
- &got)) {
- *done = 1;
+ if (got.br_startoff < offset_shift_fsb) {
+ error = -EINVAL;
goto del_cursor;
}
}
- /* Lookup the extent index at which we have to stop */
- if (direction == SHIFT_RIGHT) {
- struct xfs_bmbt_irec s;
+ error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur,
+ &logflags, dfops, new_startoff);
+ if (error)
+ goto del_cursor;
+
+done:
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
- xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s);
- /* Make stop_extent exclusive of shift range */
- stop_extent--;
- if (current_ext <= stop_extent) {
- error = -EIO;
+ *next_fsb = got.br_startoff;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+int
+xfs_bmap_insert_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ bool *done,
+ xfs_fileoff_t stop_fsb,
+ xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, next;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.dfops = dfops;
+ cur->bc_private.b.flags = 0;
+ }
+
+ if (*next_fsb == NULLFSBLOCK) {
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ stop_fsb > got.br_startoff) {
+ *done = true;
goto del_cursor;
}
} else {
- stop_extent = total_extents;
- if (current_ext >= stop_extent) {
- error = -EIO;
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
goto del_cursor;
}
}
+ XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
- while (nexts++ < num_exts) {
- error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
- &current_ext, &got, cur, &logflags,
- direction, dfops);
- if (error)
+ if (stop_fsb >= got.br_startoff + got.br_blockcount) {
+ error = -EIO;
+ goto del_cursor;
+ }
+
+ new_startoff = got.br_startoff + offset_shift_fsb;
+ if (xfs_iext_peek_next_extent(ifp, &icur, &next)) {
+ if (new_startoff + got.br_blockcount > next.br_startoff) {
+ error = -EINVAL;
goto del_cursor;
- /*
- * If there was an extent merge during the shift, the extent
- * count can change. Update the total and grade the next record.
- */
- if (direction == SHIFT_LEFT) {
- total_extents = xfs_iext_count(ifp);
- stop_extent = total_extents;
}
- if (current_ext == stop_extent) {
- *done = 1;
- *next_fsb = NULLFSBLOCK;
- break;
- }
- xfs_iext_get_extent(ifp, current_ext, &got);
+ /*
+ * Unlike a left shift (which involves a hole punch), a right
+ * shift does not modify extent neighbors in any way. We should
+ * never find mergeable extents in this scenario. Check anyways
+ * and warn if we encounter two extents that could be one.
+ */
+ if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb))
+ WARN_ON_ONCE(1);
}
- if (!*done)
- *next_fsb = got.br_startoff;
+ error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur,
+ &logflags, dfops, new_startoff);
+ if (error)
+ goto del_cursor;
+
+ if (!xfs_iext_prev_extent(ifp, &icur, &got) ||
+ stop_fsb >= got.br_startoff + got.br_blockcount) {
+ *done = true;
+ goto del_cursor;
+ }
+ *next_fsb = got.br_startoff;
del_cursor:
if (cur)
xfs_btree_del_cursor(cur,
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-
if (logflags)
xfs_trans_log_inode(tp, ip, logflags);
-
return error;
}
/*
- * Splits an extent into two extents at split_fsb block such that it is
- * the first block of the current_ext. @current_ext is a target extent
- * to be split. @split_fsb is a block where the extents is split.
- * If split_fsb lies in a hole or the first block of extents, just return 0.
+ * Splits an extent into two extents at split_fsb block such that it is the
+ * first block of the current_ext. @ext is a target extent to be split.
+ * @split_fsb is a block where the extents is split. If split_fsb lies in a
+ * hole or the first block of extents, just return 0.
*/
STATIC int
xfs_bmap_split_extent_at(
@@ -6257,7 +5833,7 @@ xfs_bmap_split_extent_at(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
xfs_fsblock_t gotblkcnt; /* new block count for got */
- xfs_extnum_t current_ext;
+ struct xfs_iext_cursor icur;
int error = 0;
int logflags = 0;
int i = 0;
@@ -6285,7 +5861,7 @@ xfs_bmap_split_extent_at(
/*
* If there are not extents, or split_fsb lies in a hole we are done.
*/
- if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &current_ext, &got) ||
+ if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) ||
got.br_startoff >= split_fsb)
return 0;
@@ -6300,44 +5876,35 @@ xfs_bmap_split_extent_at(
cur->bc_private.b.firstblock = *firstfsb;
cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
- error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
}
got.br_blockcount = gotblkcnt;
- xfs_iext_update_extent(ifp, current_ext, &got);
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur,
+ &got);
logflags = XFS_ILOG_CORE;
if (cur) {
- error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state);
+ error = xfs_bmbt_update(cur, &got);
if (error)
goto del_cursor;
} else
logflags |= XFS_ILOG_DEXT;
/* Add new extent */
- current_ext++;
- xfs_iext_insert(ip, current_ext, 1, &new, 0);
+ xfs_iext_next(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &new, 0);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur) {
- error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
- new.br_startblock, new.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &new, &i);
if (error)
goto del_cursor;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
- cur->bc_rec.b.br_state = new.br_state;
-
error = xfs_btree_insert(cur, &i);
if (error)
goto del_cursor;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 851982a5dfbc..e36d75799cd5 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -43,7 +43,7 @@ struct xfs_bmalloca {
xfs_fsblock_t blkno; /* starting block of new extent */
struct xfs_btree_cur *cur; /* btree cursor */
- xfs_extnum_t idx; /* current extent index */
+ struct xfs_iext_cursor icur; /* incore extent cursor */
int nallocs;/* number of extents alloc'd */
int logflags;/* flags for transaction logging */
@@ -113,6 +113,9 @@ struct xfs_extent_free_item
/* Only convert delalloc space, don't allocate entirely new extents */
#define XFS_BMAPI_DELALLOC 0x400
+/* Only convert unwritten extents, don't allocate new blocks */
+#define XFS_BMAPI_CONVERT_ONLY 0x800
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -124,7 +127,8 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }
+ { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
+ { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }
static inline int xfs_bmapi_aflag(int w)
@@ -183,31 +187,9 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
!isnullstartblock(irec->br_startblock);
}
-/*
- * This macro is used to determine how many extents will be shifted
- * in one write transaction. We could require two splits,
- * an extent move on the first and an extent merge on the second,
- * So it is proper that one extent is shifted inside write transaction
- * at a time.
- */
-#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
-
-enum shift_direction {
- SHIFT_LEFT = 0,
- SHIFT_RIGHT,
-};
-
-#ifdef DEBUG
-void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
- int whichfork, unsigned long caller_ip);
-#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
- xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
-#else
-#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
-#endif
-
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
+void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
@@ -221,8 +203,6 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
-int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
- int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
int *nmap, int flags);
@@ -240,20 +220,25 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
- xfs_extnum_t *idx, struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
-void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
- struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
uint xfs_default_attroffset(struct xfs_inode *ip);
-int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops);
+int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
- int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
- struct xfs_defer_ops *dfops, enum shift_direction direction,
- int num_exts);
+ bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof);
+ struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
+ int eof);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
@@ -277,4 +262,16 @@ int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+static inline int xfs_bmap_fork_to_state(int whichfork)
+{
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ return BMAP_ATTRFORK;
+ case XFS_COW_FORK:
+ return BMAP_COWFORK;
+ default:
+ return 0;
+ }
+}
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index a6331ffa51e3..c10aecaaae44 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -38,22 +38,6 @@
#include "xfs_rmap.h"
/*
- * Determine the extent state.
- */
-/* ARGSUSED */
-STATIC xfs_exntst_t
-xfs_extent_state(
- xfs_filblks_t blks,
- int extent_flag)
-{
- if (extent_flag) {
- ASSERT(blks != 0); /* saved for DMIG */
- return XFS_EXT_UNWRITTEN;
- }
- return XFS_EXT_NORM;
-}
-
-/*
* Convert on-disk form of btree root to in-memory form.
*/
void
@@ -87,84 +71,21 @@ xfs_bmdr_to_bmbt(
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
-/*
- * Convert a compressed bmap extent record to an uncompressed form.
- * This code must be in sync with the routines xfs_bmbt_get_startoff,
- * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
- */
-STATIC void
-__xfs_bmbt_get_all(
- uint64_t l0,
- uint64_t l1,
- xfs_bmbt_irec_t *s)
-{
- int ext_flag;
- xfs_exntst_t st;
-
- ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
- s->br_startoff = ((xfs_fileoff_t)l0 &
- xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
- s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
- (((xfs_fsblock_t)l1) >> 21);
- s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
- /* This is xfs_extent_state() in-line */
- if (ext_flag) {
- ASSERT(s->br_blockcount != 0); /* saved for DMIG */
- st = XFS_EXT_UNWRITTEN;
- } else
- st = XFS_EXT_NORM;
- s->br_state = st;
-}
-
void
-xfs_bmbt_get_all(
- xfs_bmbt_rec_host_t *r,
- xfs_bmbt_irec_t *s)
-{
- __xfs_bmbt_get_all(r->l0, r->l1, s);
-}
-
-/*
- * Extract the blockcount field from an in memory bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_get_blockcount(
- xfs_bmbt_rec_host_t *r)
-{
- return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
-}
-
-/*
- * Extract the startblock field from an in memory bmap extent record.
- */
-xfs_fsblock_t
-xfs_bmbt_get_startblock(
- xfs_bmbt_rec_host_t *r)
-{
- return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
- (((xfs_fsblock_t)r->l1) >> 21);
-}
-
-/*
- * Extract the startoff field from an in memory bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_get_startoff(
- xfs_bmbt_rec_host_t *r)
-{
- return ((xfs_fileoff_t)r->l0 &
- xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-xfs_exntst_t
-xfs_bmbt_get_state(
- xfs_bmbt_rec_host_t *r)
-{
- int ext_flag;
-
- ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
- return xfs_extent_state(xfs_bmbt_get_blockcount(r),
- ext_flag);
+xfs_bmbt_disk_get_all(
+ struct xfs_bmbt_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ uint64_t l0 = get_unaligned_be64(&rec->l0);
+ uint64_t l1 = get_unaligned_be64(&rec->l1);
+
+ irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+ irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21);
+ irec->br_blockcount = l1 & xfs_mask64lo(21);
+ if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
}
/*
@@ -188,142 +109,29 @@ xfs_bmbt_disk_get_startoff(
xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
}
-
-/*
- * Set all the fields in a bmap extent record from the arguments.
- */
-void
-xfs_bmbt_set_allf(
- xfs_bmbt_rec_host_t *r,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state)
-{
- int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
- ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
- ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
- ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
- ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9) |
- ((xfs_bmbt_rec_base_t)startblock >> 43);
- r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-}
-
/*
* Set all the fields in a bmap extent record from the uncompressed form.
*/
void
-xfs_bmbt_set_all(
- xfs_bmbt_rec_host_t *r,
- xfs_bmbt_irec_t *s)
-{
- xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
- s->br_blockcount, s->br_state);
-}
-
-
-/*
- * Set all the fields in a disk format bmap extent record from the arguments.
- */
-void
-xfs_bmbt_disk_set_allf(
- xfs_bmbt_rec_t *r,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state)
-{
- int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
- ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
- ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
- ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
- ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
- r->l0 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9) |
- ((xfs_bmbt_rec_base_t)startblock >> 43));
- r->l1 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-}
-
-/*
- * Set all the fields in a bmap extent record from the uncompressed form.
- */
-STATIC void
xfs_bmbt_disk_set_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
- s->br_blockcount, s->br_state);
-}
-
-/*
- * Set the blockcount field in a bmap extent record.
- */
-void
-xfs_bmbt_set_blockcount(
- xfs_bmbt_rec_host_t *r,
- xfs_filblks_t v)
+ struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s)
{
- ASSERT((v & xfs_mask64hi(43)) == 0);
- r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
- (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
-}
-
-/*
- * Set the startblock field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startblock(
- xfs_bmbt_rec_host_t *r,
- xfs_fsblock_t v)
-{
- ASSERT((v & xfs_mask64hi(12)) == 0);
- r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
- (xfs_bmbt_rec_base_t)(v >> 43);
- r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
- (xfs_bmbt_rec_base_t)(v << 21);
-}
+ int extent_flag = (s->br_state != XFS_EXT_NORM);
-/*
- * Set the startoff field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startoff(
- xfs_bmbt_rec_host_t *r,
- xfs_fileoff_t v)
-{
- ASSERT((v & xfs_mask64hi(9)) == 0);
- r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
- ((xfs_bmbt_rec_base_t)v << 9) |
- (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-}
+ ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN);
+ ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)));
+ ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)));
+ ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)));
-/*
- * Set the extent state field in a bmap extent record.
- */
-void
-xfs_bmbt_set_state(
- xfs_bmbt_rec_host_t *r,
- xfs_exntst_t v)
-{
- ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
- if (v == XFS_EXT_NORM)
- r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
- else
- r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+ ((xfs_bmbt_rec_base_t)s->br_startblock >> 43), &r->l0);
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+ ((xfs_bmbt_rec_base_t)s->br_blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)), &r->l1);
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 9da5a8d4f184..135b8c56d23e 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -98,25 +98,11 @@ struct xfs_trans;
*/
extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
struct xfs_btree_block *, int);
-extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
-extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
-extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
-extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
+void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-
-extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
- xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
-extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
-extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
-extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
-
-extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
- xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
@@ -136,9 +122,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
* Check that the extent does not contain an invalid unwritten extent flag.
*/
static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
- struct xfs_bmbt_rec_host *ep)
+ struct xfs_bmbt_irec *irec)
{
- if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0)
+ if (irec->br_state == XFS_EXT_NORM)
return true;
if (whichfork == XFS_DATA_FORK &&
xfs_sb_version_hasextflgbit(&mp->m_sb))
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 5bfb88261c7e..5f33adf8eecb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -29,6 +29,7 @@
#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
@@ -63,44 +64,63 @@ xfs_btree_magic(
return magic;
}
-STATIC int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
- struct xfs_btree_cur *cur, /* btree cursor */
- struct xfs_btree_block *block, /* btree long form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp) /* buffer for block, if any */
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
{
- int lblock_ok = 1; /* block passes checks */
- struct xfs_mount *mp; /* file system mount point */
+ struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc;
-
- mp = cur->bc_mp;
- crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_sb_version_hascrc(&mp->m_sb);
if (crc) {
- lblock_ok = lblock_ok &&
- uuid_equal(&block->bb_u.l.bb_uuid,
- &mp->m_sb.sb_meta_uuid) &&
- block->bb_u.l.bb_blkno == cpu_to_be64(
- bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.l.bb_blkno !=
+ cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ return __this_address;
+ if (block->bb_u.l.bb_pad != cpu_to_be32(0))
+ return __this_address;
}
- lblock_ok = lblock_ok &&
- be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
- be16_to_cpu(block->bb_level) == level &&
- be16_to_cpu(block->bb_numrecs) <=
- cur->bc_ops->get_maxrecs(cur, level) &&
- block->bb_u.l.bb_leftsib &&
- (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
- block->bb_u.l.bb_rightsib &&
- (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
- if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+ if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
+ !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
+ level + 1))
+ return __this_address;
+ if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
+ !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
+ level + 1))
+ return __this_address;
+
+ return NULL;
+}
+
+/* Check a long btree block header. */
+static int
+xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_lblock(cur, block, level, bp);
+ if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -110,48 +130,61 @@ xfs_btree_check_lblock(
return 0;
}
-STATIC int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
- struct xfs_btree_cur *cur, /* btree cursor */
- struct xfs_btree_block *block, /* btree short form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp) /* buffer containing block */
+/*
+ * Check a short btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp; /* file system mount point */
- struct xfs_buf *agbp; /* buffer for ag. freespace struct */
- struct xfs_agf *agf; /* ag. freespace structure */
- xfs_agblock_t agflen; /* native ag. freespace length */
- int sblock_ok = 1; /* block passes checks */
+ struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc;
-
- mp = cur->bc_mp;
- crc = xfs_sb_version_hascrc(&mp->m_sb);
- agbp = cur->bc_private.a.agbp;
- agf = XFS_BUF_TO_AGF(agbp);
- agflen = be32_to_cpu(agf->agf_length);
+ int crc = xfs_sb_version_hascrc(&mp->m_sb);
if (crc) {
- sblock_ok = sblock_ok &&
- uuid_equal(&block->bb_u.s.bb_uuid,
- &mp->m_sb.sb_meta_uuid) &&
- block->bb_u.s.bb_blkno == cpu_to_be64(
- bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.s.bb_blkno !=
+ cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ return __this_address;
}
- sblock_ok = sblock_ok &&
- be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
- be16_to_cpu(block->bb_level) == level &&
- be16_to_cpu(block->bb_numrecs) <=
- cur->bc_ops->get_maxrecs(cur, level) &&
- (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
- block->bb_u.s.bb_leftsib &&
- (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
- block->bb_u.s.bb_rightsib;
-
- if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+ if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
+ !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
+ level + 1))
+ return __this_address;
+ if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
+ !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
+ level + 1))
+ return __this_address;
+
+ return NULL;
+}
+
+/* Check a short btree block header. */
+STATIC int
+xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_sblock(cur, block, level, bp);
+ if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -177,59 +210,53 @@ xfs_btree_check_block(
return xfs_btree_check_sblock(cur, block, level, bp);
}
-/*
- * Check that (long) pointer is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
+/* Check that this long pointer is valid and points within the fs. */
+bool
xfs_btree_check_lptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t bno, /* btree block disk address */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t fsbno,
+ int level)
{
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
- level > 0 &&
- bno != NULLFSBLOCK &&
- XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
- return 0;
+ if (level <= 0)
+ return false;
+ return xfs_verify_fsbno(cur->bc_mp, fsbno);
}
-#ifdef DEBUG
-/*
- * Check that (short) pointer is ok.
- */
-STATIC int /* error (0 or EFSCORRUPTED) */
+/* Check that this short pointer is valid and points within the AG. */
+bool
xfs_btree_check_sptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t bno, /* btree block disk address */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ int level)
{
- xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
-
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
- level > 0 &&
- bno != NULLAGBLOCK &&
- bno != 0 &&
- bno < agblocks);
- return 0;
+ if (level <= 0)
+ return false;
+ return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
}
+#ifdef DEBUG
/*
- * Check that block ptr is ok.
+ * Check that a given (indexed) btree pointer at a certain level of a
+ * btree is valid and doesn't point past where it should.
*/
-STATIC int /* error (0 or EFSCORRUPTED) */
+static int
xfs_btree_check_ptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- union xfs_btree_ptr *ptr, /* btree block disk address */
- int index, /* offset from ptr to check */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- return xfs_btree_check_lptr(cur,
- be64_to_cpu((&ptr->l)[index]), level);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+ xfs_btree_check_lptr(cur,
+ be64_to_cpu((&ptr->l)[index]), level));
} else {
- return xfs_btree_check_sptr(cur,
- be32_to_cpu((&ptr->s)[index]), level);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+ xfs_btree_check_sptr(cur,
+ be32_to_cpu((&ptr->s)[index]), level));
}
+
+ return 0;
}
#endif
@@ -1027,7 +1054,7 @@ xfs_btree_setbuf(
}
}
-STATIC int
+bool
xfs_btree_ptr_is_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
@@ -1052,7 +1079,7 @@ xfs_btree_set_ptr_null(
/*
* Get/set/init sibling pointers
*/
-STATIC void
+void
xfs_btree_get_sibling(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
@@ -2001,7 +2028,7 @@ error0:
}
/* Find the high key storage area from a regular key. */
-STATIC union xfs_btree_key *
+union xfs_btree_key *
xfs_btree_high_key_from_key(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
@@ -2075,7 +2102,7 @@ xfs_btree_get_node_keys(
}
/* Derive the keys for any btree block. */
-STATIC void
+void
xfs_btree_get_keys(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
@@ -4914,3 +4941,15 @@ xfs_btree_count_blocks(
return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
blocks);
}
+
+/* Compare two btree pointers. */
+int64_t
+xfs_btree_diff_two_ptrs(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
+ return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index f2a88c3b1159..b57501c6f71d 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -255,6 +255,14 @@ typedef struct xfs_btree_cur
*/
#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
+/*
+ * Internal long and short btree block checks. They return NULL if the
+ * block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
/*
* Check that block header is ok.
@@ -269,10 +277,19 @@ xfs_btree_check_block(
/*
* Check that (long) pointer is ok.
*/
-int /* error (0 or EFSCORRUPTED) */
+bool /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t ptr, /* btree block disk address */
+ xfs_fsblock_t fsbno, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Check that (short) pointer is ok.
+ */
+bool /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t agbno, /* btree block disk address */
int level); /* btree block level */
/*
@@ -517,5 +534,16 @@ int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
+bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr);
+int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
+void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr, int lr);
+void xfs_btree_get_keys(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, union xfs_btree_key *key);
+union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index 8211f48b98e6..999a290cfd72 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_CKSUM_H
#define _XFS_CKSUM_H 1
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 6d4335815c3f..651611530d2f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -1466,6 +1466,7 @@ xfs_da3_node_lookup_int(
int max;
int error;
int retval;
+ unsigned int expected_level = 0;
struct xfs_inode *dp = state->args->dp;
args = state->args;
@@ -1474,7 +1475,7 @@ xfs_da3_node_lookup_int(
* Descend thru the B-tree searching each level for the right
* node to use, until the right hashval is found.
*/
- blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
+ blkno = args->geo->leafblk;
for (blk = &state->path.blk[0], state->path.active = 1;
state->path.active <= XFS_DA_NODE_MAXDEPTH;
blk++, state->path.active++) {
@@ -1517,6 +1518,18 @@ xfs_da3_node_lookup_int(
dp->d_ops->node_hdr_from_disk(&nodehdr, node);
btree = dp->d_ops->node_tree_p(node);
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ return -EFSCORRUPTED;
+
+ /* Check the level from the root. */
+ if (blkno == args->geo->leafblk)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ return -EFSCORRUPTED;
+ else
+ expected_level--;
+
max = nodehdr.count;
blk->hashval = be32_to_cpu(btree[max - 1].hashval);
@@ -1562,8 +1575,15 @@ xfs_da3_node_lookup_int(
blk->index = probe;
blkno = be32_to_cpu(btree[probe].before);
}
+
+ /* We can't point back to the root. */
+ if (blkno == args->geo->leafblk)
+ return -EFSCORRUPTED;
}
+ if (expected_level != 0)
+ return -EFSCORRUPTED;
+
/*
* A leaf block that ends in the hashval that we are interested in
* (final hashval == search hashval) means that the next block may
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index ccf9783fd3f0..e10778c102ea 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -30,6 +30,8 @@
#include "xfs_bmap.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -38,7 +40,9 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
/*
* Convert inode mode to directory entry filetype
*/
-unsigned char xfs_mode_to_ftype(int mode)
+unsigned char
+xfs_mode_to_ftype(
+ int mode)
{
switch (mode & S_IFMT) {
case S_IFREG:
@@ -202,22 +206,8 @@ xfs_dir_ino_validate(
xfs_mount_t *mp,
xfs_ino_t ino)
{
- xfs_agblock_t agblkno;
- xfs_agino_t agino;
- xfs_agnumber_t agno;
- int ino_ok;
- int ioff;
-
- agno = XFS_INO_TO_AGNO(mp, ino);
- agblkno = XFS_INO_TO_AGBNO(mp, ino);
- ioff = XFS_INO_TO_OFFSET(mp, ino);
- agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
- ino_ok =
- agno < mp->m_sb.sb_agcount &&
- agblkno < mp->m_sb.sb_agblocks &&
- agblkno != 0 &&
- ioff < (1 << mp->m_sb.sb_inopblog) &&
- XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+ bool ino_ok = xfs_verify_dir_ino(mp, ino);
+
if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 21c8f8bf94d5..1a8f2cf977ca 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -324,4 +324,21 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
sizeof(struct xfs_dir2_leaf_tail));
}
+/*
+ * The Linux API doesn't pass down the total size of the buffer
+ * we read into down to the filesystem. With the filldir concept
+ * it's not needed for correct information, but the XFS dir2 leaf
+ * code wants an estimate of the buffer size to calculate it's
+ * readahead window and size the buffers used for mapping to
+ * physical blocks.
+ *
+ * Try to give it an estimate that's good enough, maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ * musl libc hardcodes 2k and dietlibc uses PAGE_SIZE.
+ */
+#define XFS_READDIR_BUFSIZE (32768)
+
+unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
+
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
new file mode 100644
index 000000000000..bc1789d95152
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2017 Oracle.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_ERRORTAG_H_
+#define __XFS_ERRORTAG_H_
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR 0
+#define XFS_ERRTAG_IFLUSH_1 1
+#define XFS_ERRTAG_IFLUSH_2 2
+#define XFS_ERRTAG_IFLUSH_3 3
+#define XFS_ERRTAG_IFLUSH_4 4
+#define XFS_ERRTAG_IFLUSH_5 5
+#define XFS_ERRTAG_IFLUSH_6 6
+#define XFS_ERRTAG_DA_READ_BUF 7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
+#define XFS_ERRTAG_ALLOC_READ_AGF 10
+#define XFS_ERRTAG_IALLOC_READ_AGI 11
+#define XFS_ERRTAG_ITOBP_INOTOBP 12
+#define XFS_ERRTAG_IUNLINK 13
+#define XFS_ERRTAG_IUNLINK_REMOVE 14
+#define XFS_ERRTAG_DIR_INO_VALIDATE 15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
+#define XFS_ERRTAG_IODONE_IOERR 17
+#define XFS_ERRTAG_STRATREAD_IOERR 18
+#define XFS_ERRTAG_STRATCMPL_IOERR 19
+#define XFS_ERRTAG_DIOWRITE_IOERR 20
+#define XFS_ERRTAG_BMAPIFORMAT 21
+#define XFS_ERRTAG_FREE_EXTENT 22
+#define XFS_ERRTAG_RMAP_FINISH_ONE 23
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
+#define XFS_ERRTAG_BMAP_FINISH_ONE 26
+#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+/*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are silenty dropped and handled as if they failed.
+ * All delalloc blocks in the range of the write (including pre-existing
+ * delalloc blocks!) are tossed as part of the write failure error
+ * handling sequence.
+ */
+#define XFS_ERRTAG_DROP_WRITES 28
+#define XFS_ERRTAG_LOG_BAD_CRC 29
+#define XFS_ERRTAG_LOG_ITEM_PIN 30
+#define XFS_ERRTAG_BUF_LRU_REF 31
+#define XFS_ERRTAG_MAX 32
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT 100
+#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_FREE_EXTENT 1
+#define XFS_RANDOM_RMAP_FINISH_ONE 1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
+#define XFS_RANDOM_BMAP_FINISH_ONE 1
+#define XFS_RANDOM_AG_RESV_CRITICAL 4
+#define XFS_RANDOM_DROP_WRITES 1
+#define XFS_RANDOM_LOG_BAD_CRC 1
+#define XFS_RANDOM_LOG_ITEM_PIN 1
+#define XFS_RANDOM_BUF_LRU_REF 2
+
+#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 23229f0c5b15..1acb584fc5f7 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -315,6 +315,11 @@ static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
return false;
}
+static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp)
+{
+ return sbp->sb_rblocks > 0;
+}
+
/*
* Detect a mismatched features2 field. Older kernels read/wrote
* this into the wrong slot, so to be safe we keep them in sync.
@@ -500,12 +505,12 @@ xfs_sb_has_incompat_log_feature(
/*
* V5 superblock specific feature checks
*/
-static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
-static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
@@ -518,7 +523,7 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
(sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
}
-static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
@@ -941,7 +946,7 @@ typedef enum xfs_dinode_fmt {
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
- XFS_DINODE_FMT_UUID /* uuid_t */
+ XFS_DINODE_FMT_UUID /* added long ago, but never used */
} xfs_dinode_fmt_t;
/*
@@ -1142,7 +1147,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* Dquot and dquot block format definitions
*/
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
-#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
+#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
/*
* This is the main portion of the on-disk representation of quota
@@ -1548,10 +1553,6 @@ typedef struct xfs_bmbt_rec {
typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
-typedef struct xfs_bmbt_rec_host {
- uint64_t l0, l1;
-} xfs_bmbt_rec_host_t;
-
/*
* Values and macros for delayed-allocation startblock fields.
*/
@@ -1577,24 +1578,6 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
}
/*
- * Possible extent states.
- */
-typedef enum {
- XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
-} xfs_exntst_t;
-
-/*
- * Incore version of above.
- */
-typedef struct xfs_bmbt_irec
-{
- xfs_fileoff_t br_startoff; /* starting file offset */
- xfs_fsblock_t br_startblock; /* starting block number */
- xfs_filblks_t br_blockcount; /* number of blocks */
- xfs_exntst_t br_state; /* extent state */
-} xfs_bmbt_irec_t;
-
-/*
* Key structure for non-leaf levels of the tree.
*/
typedef struct xfs_bmbt_key {
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 8c61f21535d4..b90924104596 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -468,6 +468,82 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+/* metadata scrubbing */
+struct xfs_scrub_metadata {
+ __u32 sm_type; /* What to check? */
+ __u32 sm_flags; /* flags; see below. */
+ __u64 sm_ino; /* inode number. */
+ __u32 sm_gen; /* inode generation. */
+ __u32 sm_agno; /* ag number. */
+ __u64 sm_reserved[5]; /* pad to 64 bytes */
+};
+
+/*
+ * Metadata types and flags for scrub operation.
+ */
+
+/* Scrub subcommands. */
+#define XFS_SCRUB_TYPE_PROBE 0 /* presence test ioctl */
+#define XFS_SCRUB_TYPE_SB 1 /* superblock */
+#define XFS_SCRUB_TYPE_AGF 2 /* AG free header */
+#define XFS_SCRUB_TYPE_AGFL 3 /* AG free list */
+#define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */
+#define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */
+#define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */
+#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */
+#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */
+#define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */
+#define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */
+#define XFS_SCRUB_TYPE_INODE 11 /* inode record */
+#define XFS_SCRUB_TYPE_BMBTD 12 /* data fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTA 13 /* attr fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTC 14 /* CoW fork block mapping */
+#define XFS_SCRUB_TYPE_DIR 15 /* directory */
+#define XFS_SCRUB_TYPE_XATTR 16 /* extended attribute */
+#define XFS_SCRUB_TYPE_SYMLINK 17 /* symbolic link */
+#define XFS_SCRUB_TYPE_PARENT 18 /* parent pointers */
+#define XFS_SCRUB_TYPE_RTBITMAP 19 /* realtime bitmap */
+#define XFS_SCRUB_TYPE_RTSUM 20 /* realtime summary */
+#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */
+#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
+#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
+
+/* Number of scrub subcommands. */
+#define XFS_SCRUB_TYPE_NR 24
+
+/* i: Repair this metadata. */
+#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+
+/* o: Metadata object needs repair. */
+#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+
+/*
+ * o: Metadata object could be optimized. It's not corrupt, but
+ * we could improve on it somehow.
+ */
+#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+
+/* o: Cross-referencing failed. */
+#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+
+/* o: Metadata object disagrees with cross-referenced metadata. */
+#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+
+/* o: Scan was not complete. */
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+
+/* o: Metadata object looked funny but isn't corrupt. */
+#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING)
+#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
+
/*
* ioctl limits
*/
@@ -511,6 +587,7 @@ typedef struct xfs_swapext
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/* XFS_IOC_GETFSMAP ------ hoisted 59 */
+#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 988bb3f31446..de3f04a98656 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -31,6 +31,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_cksum.h"
@@ -1962,7 +1963,7 @@ xfs_difree_inobt(
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
- xic->deleted = 1;
+ xic->deleted = true;
xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
@@ -1989,7 +1990,7 @@ xfs_difree_inobt(
xfs_difree_inode_chunk(mp, agno, &rec, dfops);
} else {
- xic->deleted = 0;
+ xic->deleted = false;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -2664,3 +2665,93 @@ xfs_ialloc_pagi_init(
xfs_trans_brelse(tp, bp);
return 0;
}
+
+/* Calculate the first and last possible inode number in an AG. */
+void
+xfs_ialloc_agino_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ xfs_agblock_t bno;
+ xfs_agblock_t eoag;
+
+ eoag = xfs_ag_block_count(mp, agno);
+
+ /*
+ * Calculate the first inode, which will be in the first
+ * cluster-aligned block after the AGFL.
+ */
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1,
+ xfs_ialloc_cluster_alignment(mp));
+ *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0);
+
+ /*
+ * Calculate the last inode, which will be at the end of the
+ * last (aligned) cluster that can be allocated in the AG.
+ */
+ bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp));
+ *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+bool
+xfs_verify_agino(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ xfs_agino_t first;
+ xfs_agino_t last;
+
+ xfs_ialloc_agino_range(mp, agno, &first, &last);
+ return agino >= first && agino <= last;
+}
+
+/*
+ * Verify that an FS inode number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+bool
+xfs_verify_ino(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
+ return false;
+ return xfs_verify_agino(mp, agno, agino);
+}
+
+/* Is this an internal inode number? */
+bool
+xfs_internal_inum(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+ (xfs_sb_version_hasquota(&mp->m_sb) &&
+ xfs_is_quota_inode(&mp->m_sb, ino));
+}
+
+/*
+ * Verify that a directory entry's inode number doesn't point at an internal
+ * inode, empty space, or static AG metadata.
+ */
+bool
+xfs_verify_dir_ino(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ if (xfs_internal_inum(mp, ino))
+ return false;
+ return xfs_verify_ino(mp, ino);
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index b32cfb5aeb5b..d2bdcd5e7312 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -173,5 +173,12 @@ void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
+bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
new file mode 100644
index 000000000000..343a94246f5b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+
+/*
+ * In-core extent record layout:
+ *
+ * +-------+----------------------------+
+ * | 00:53 | all 54 bits of startoff |
+ * | 54:63 | low 10 bits of startblock |
+ * +-------+----------------------------+
+ * | 00:20 | all 21 bits of length |
+ * | 21 | unwritten extent bit |
+ * | 22:63 | high 42 bits of startblock |
+ * +-------+----------------------------+
+ */
+#define XFS_IEXT_STARTOFF_MASK xfs_mask64lo(BMBT_STARTOFF_BITLEN)
+#define XFS_IEXT_LENGTH_MASK xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
+#define XFS_IEXT_STARTBLOCK_MASK xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
+
+struct xfs_iext_rec {
+ uint64_t lo;
+ uint64_t hi;
+};
+
+/*
+ * Given that the length can't be a zero, only an empty hi value indicates an
+ * unused record.
+ */
+static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
+{
+ return rec->hi == 0;
+}
+
+static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
+{
+ rec->lo = 0;
+ rec->hi = 0;
+}
+
+static void
+xfs_iext_set(
+ struct xfs_iext_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
+ ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
+ ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
+
+ rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
+ rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
+
+ rec->lo |= (irec->br_startblock << 54);
+ rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10));
+
+ if (irec->br_state == XFS_EXT_UNWRITTEN)
+ rec->hi |= (1 << 21);
+}
+
+static void
+xfs_iext_get(
+ struct xfs_bmbt_irec *irec,
+ struct xfs_iext_rec *rec)
+{
+ irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
+ irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+ irec->br_startblock = rec->lo >> 54;
+ irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10);
+
+ if (rec->hi & (1 << 21))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
+}
+
+enum {
+ NODE_SIZE = 256,
+ KEYS_PER_NODE = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
+ RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
+ sizeof(struct xfs_iext_rec),
+};
+
+/*
+ * In-core extent btree block layout:
+ *
+ * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
+ *
+ * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
+ * contain the startoffset, blockcount, startblock and unwritten extent flag.
+ * See above for the exact format, followed by pointers to the previous and next
+ * leaf blocks (if there are any).
+ *
+ * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
+ * by an equal number of pointers to the btree blocks at the next lower level.
+ *
+ * +-------+-------+-------+-------+-------+----------+----------+
+ * Leaf: | rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
+ * +-------+-------+-------+-------+-------+----------+----------+
+ *
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ * Inner: | key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ */
+struct xfs_iext_node {
+ uint64_t keys[KEYS_PER_NODE];
+#define XFS_IEXT_KEY_INVALID (1ULL << 63)
+ void *ptrs[KEYS_PER_NODE];
+};
+
+struct xfs_iext_leaf {
+ struct xfs_iext_rec recs[RECS_PER_LEAF];
+ struct xfs_iext_leaf *prev;
+ struct xfs_iext_leaf *next;
+};
+
+inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
+{
+ return ifp->if_bytes / sizeof(struct xfs_iext_rec);
+}
+
+static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
+{
+ if (ifp->if_height == 1)
+ return xfs_iext_count(ifp);
+ return RECS_PER_LEAF;
+}
+
+static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
+{
+ return &cur->leaf->recs[cur->pos];
+}
+
+static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf)
+ return false;
+ if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp))
+ return false;
+ if (xfs_iext_rec_is_empty(cur_rec(cur)))
+ return false;
+ return true;
+}
+
+static void *
+xfs_iext_find_first_leaf(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > 1; height--) {
+ node = node->ptrs[0];
+ ASSERT(node);
+ }
+
+ return node;
+}
+
+static void *
+xfs_iext_find_last_leaf(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > 1; height--) {
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ if (!node->ptrs[i])
+ break;
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ return node;
+}
+
+void
+xfs_iext_first(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ cur->pos = 0;
+ cur->leaf = xfs_iext_find_first_leaf(ifp);
+}
+
+void
+xfs_iext_last(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ int i;
+
+ cur->leaf = xfs_iext_find_last_leaf(ifp);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return;
+ }
+
+ for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
+ if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
+ break;
+ }
+ cur->pos = i - 1;
+}
+
+void
+xfs_iext_next(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf) {
+ ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+ xfs_iext_first(ifp, cur);
+ return;
+ }
+
+ ASSERT(cur->pos >= 0);
+ ASSERT(cur->pos < xfs_iext_max_recs(ifp));
+
+ cur->pos++;
+ if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) &&
+ cur->leaf->next) {
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ }
+}
+
+void
+xfs_iext_prev(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf) {
+ ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+ xfs_iext_last(ifp, cur);
+ return;
+ }
+
+ ASSERT(cur->pos >= 0);
+ ASSERT(cur->pos <= RECS_PER_LEAF);
+
+recurse:
+ do {
+ cur->pos--;
+ if (xfs_iext_valid(ifp, cur))
+ return;
+ } while (cur->pos > 0);
+
+ if (ifp->if_height > 1 && cur->leaf->prev) {
+ cur->leaf = cur->leaf->prev;
+ cur->pos = RECS_PER_LEAF;
+ goto recurse;
+ }
+}
+
+static inline int
+xfs_iext_key_cmp(
+ struct xfs_iext_node *node,
+ int n,
+ xfs_fileoff_t offset)
+{
+ if (node->keys[n] > offset)
+ return 1;
+ if (node->keys[n] < offset)
+ return -1;
+ return 0;
+}
+
+static inline int
+xfs_iext_rec_cmp(
+ struct xfs_iext_rec *rec,
+ xfs_fileoff_t offset)
+{
+ uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
+ u32 rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+ if (rec_offset > offset)
+ return 1;
+ if (rec_offset + rec_len <= offset)
+ return -1;
+ return 0;
+}
+
+static void *
+xfs_iext_find_level(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ int level)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ break;
+
+ node = node->ptrs[i - 1];
+ if (!node)
+ break;
+ }
+
+ return node;
+}
+
+static int
+xfs_iext_node_pos(
+ struct xfs_iext_node *node,
+ xfs_fileoff_t offset)
+{
+ int i;
+
+ for (i = 1; i < KEYS_PER_NODE; i++) {
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ break;
+ }
+
+ return i - 1;
+}
+
+static int
+xfs_iext_node_insert_pos(
+ struct xfs_iext_node *node,
+ xfs_fileoff_t offset)
+{
+ int i;
+
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ return i;
+ }
+
+ return KEYS_PER_NODE;
+}
+
+static int
+xfs_iext_node_nr_entries(
+ struct xfs_iext_node *node,
+ int start)
+{
+ int i;
+
+ for (i = start; i < KEYS_PER_NODE; i++) {
+ if (node->keys[i] == XFS_IEXT_KEY_INVALID)
+ break;
+ }
+
+ return i;
+}
+
+static int
+xfs_iext_leaf_nr_entries(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_leaf *leaf,
+ int start)
+{
+ int i;
+
+ for (i = start; i < xfs_iext_max_recs(ifp); i++) {
+ if (xfs_iext_rec_is_empty(&leaf->recs[i]))
+ break;
+ }
+
+ return i;
+}
+
+static inline uint64_t
+xfs_iext_leaf_key(
+ struct xfs_iext_leaf *leaf,
+ int n)
+{
+ return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
+}
+
+static void
+xfs_iext_grow(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ int i;
+
+ if (ifp->if_height == 1) {
+ struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
+
+ node->keys[0] = xfs_iext_leaf_key(prev, 0);
+ node->ptrs[0] = prev;
+ } else {
+ struct xfs_iext_node *prev = ifp->if_u1.if_root;
+
+ ASSERT(ifp->if_height > 1);
+
+ node->keys[0] = prev->keys[0];
+ node->ptrs[0] = prev;
+ }
+
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ node->keys[i] = XFS_IEXT_KEY_INVALID;
+
+ ifp->if_u1.if_root = node;
+ ifp->if_height++;
+}
+
+static void
+xfs_iext_update_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t old_offset,
+ xfs_fileoff_t new_offset,
+ int level,
+ void *ptr)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
+ break;
+ if (node->keys[i] == old_offset)
+ node->keys[i] = new_offset;
+ }
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ ASSERT(node == ptr);
+}
+
+static struct xfs_iext_node *
+xfs_iext_split_node(
+ struct xfs_iext_node **nodep,
+ int *pos,
+ int *nr_entries)
+{
+ struct xfs_iext_node *node = *nodep;
+ struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = KEYS_PER_NODE / 2;
+ int nr_keep = nr_move + (KEYS_PER_NODE & 1);
+ int i = 0;
+
+ /* for sequential append operations just spill over into the new node */
+ if (*pos == KEYS_PER_NODE) {
+ *nodep = new;
+ *pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+
+ for (i = 0; i < nr_move; i++) {
+ new->keys[i] = node->keys[nr_keep + i];
+ new->ptrs[i] = node->ptrs[nr_keep + i];
+
+ node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_keep + i] = NULL;
+ }
+
+ if (*pos >= nr_keep) {
+ *nodep = new;
+ *pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ for (; i < KEYS_PER_NODE; i++)
+ new->keys[i] = XFS_IEXT_KEY_INVALID;
+ return new;
+}
+
+static void
+xfs_iext_insert_node(
+ struct xfs_ifork *ifp,
+ uint64_t offset,
+ void *ptr,
+ int level)
+{
+ struct xfs_iext_node *node, *new;
+ int i, pos, nr_entries;
+
+again:
+ if (ifp->if_height < level)
+ xfs_iext_grow(ifp);
+
+ new = NULL;
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_insert_pos(node, offset);
+ nr_entries = xfs_iext_node_nr_entries(node, pos);
+
+ ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
+ ASSERT(nr_entries <= KEYS_PER_NODE);
+
+ if (nr_entries == KEYS_PER_NODE)
+ new = xfs_iext_split_node(&node, &pos, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (node != new && pos == 0 && nr_entries > 0)
+ xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
+
+ for (i = nr_entries; i > pos; i--) {
+ node->keys[i] = node->keys[i - 1];
+ node->ptrs[i] = node->ptrs[i - 1];
+ }
+ node->keys[pos] = offset;
+ node->ptrs[pos] = ptr;
+
+ if (new) {
+ offset = new->keys[0];
+ ptr = new;
+ level++;
+ goto again;
+ }
+}
+
+static struct xfs_iext_leaf *
+xfs_iext_split_leaf(
+ struct xfs_iext_cursor *cur,
+ int *nr_entries)
+{
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = RECS_PER_LEAF / 2;
+ int nr_keep = nr_move + (RECS_PER_LEAF & 1);
+ int i;
+
+ /* for sequential append operations just spill over into the new node */
+ if (cur->pos == RECS_PER_LEAF) {
+ cur->leaf = new;
+ cur->pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+ for (i = 0; i < nr_move; i++) {
+ new->recs[i] = leaf->recs[nr_keep + i];
+ xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
+ }
+
+ if (cur->pos >= nr_keep) {
+ cur->leaf = new;
+ cur->pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ if (leaf->next)
+ leaf->next->prev = new;
+ new->next = leaf->next;
+ new->prev = leaf;
+ leaf->next = new;
+ return new;
+}
+
+static void
+xfs_iext_alloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ ASSERT(ifp->if_bytes == 0);
+
+ ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_height = 1;
+
+ /* now that we have a node step into it */
+ cur->leaf = ifp->if_u1.if_root;
+ cur->pos = 0;
+}
+
+static void
+xfs_iext_realloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
+ void *new;
+
+ /* account for the prev/next pointers */
+ if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
+ new_size = NODE_SIZE;
+
+ new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
+ memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
+ ifp->if_u1.if_root = new;
+ cur->leaf = new;
+}
+
+void
+xfs_iext_insert(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ xfs_fileoff_t offset = irec->br_startoff;
+ struct xfs_iext_leaf *new = NULL;
+ int nr_entries, i;
+
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+
+ if (ifp->if_height == 0)
+ xfs_iext_alloc_root(ifp, cur);
+ else if (ifp->if_height == 1)
+ xfs_iext_realloc_root(ifp, cur);
+
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
+ ASSERT(nr_entries <= RECS_PER_LEAF);
+ ASSERT(cur->pos >= nr_entries ||
+ xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
+
+ if (nr_entries == RECS_PER_LEAF)
+ new = xfs_iext_split_leaf(cur, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
+ offset, 1, cur->leaf);
+ }
+
+ for (i = nr_entries; i > cur->pos; i--)
+ cur->leaf->recs[i] = cur->leaf->recs[i - 1];
+ xfs_iext_set(cur_rec(cur), irec);
+ ifp->if_bytes += sizeof(struct xfs_iext_rec);
+
+ if (new)
+ xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
+}
+
+static struct xfs_iext_node *
+xfs_iext_rebalance_node(
+ struct xfs_iext_node *parent,
+ int *pos,
+ struct xfs_iext_node *node,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full, or have different
+ * parents, we might never be able to merge our node, and will only
+ * delete it once the number of entries hits zero.
+ */
+ if (nr_entries == 0)
+ return node;
+
+ if (*pos > 0) {
+ struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
+ int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
+
+ if (nr_prev + nr_entries <= KEYS_PER_NODE) {
+ for (i = 0; i < nr_entries; i++) {
+ prev->keys[nr_prev + i] = node->keys[i];
+ prev->ptrs[nr_prev + i] = node->ptrs[i];
+ }
+ return node;
+ }
+ }
+
+ if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
+ struct xfs_iext_node *next = parent->ptrs[*pos + 1];
+ int nr_next = xfs_iext_node_nr_entries(next, 0), i;
+
+ if (nr_entries + nr_next <= KEYS_PER_NODE) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ node->keys[nr_entries + i] = next->keys[i];
+ node->ptrs[nr_entries + i] = next->ptrs[i];
+ }
+
+ ++*pos;
+ return next;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+xfs_iext_remove_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ void *victim)
+{
+ struct xfs_iext_node *node, *parent;
+ int level = 2, pos, nr_entries, i;
+
+ ASSERT(level <= ifp->if_height);
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(node, offset);
+again:
+ ASSERT(node->ptrs[pos]);
+ ASSERT(node->ptrs[pos] == victim);
+ kmem_free(victim);
+
+ nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
+ offset = node->keys[0];
+ for (i = pos; i < nr_entries; i++) {
+ node->keys[i] = node->keys[i + 1];
+ node->ptrs[i] = node->ptrs[i + 1];
+ }
+ node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_entries] = NULL;
+
+ if (pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
+ offset = node->keys[0];
+ }
+
+ if (nr_entries >= KEYS_PER_NODE / 2)
+ return;
+
+ if (level < ifp->if_height) {
+ /*
+ * If we aren't at the root yet try to find a neighbour node to
+ * merge with (or delete the node if it is empty), and then
+ * recurse up to the next level.
+ */
+ level++;
+ parent = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(parent, offset);
+
+ ASSERT(pos != KEYS_PER_NODE);
+ ASSERT(parent->ptrs[pos] == node);
+
+ node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
+ if (node) {
+ victim = node;
+ node = parent;
+ goto again;
+ }
+ } else if (nr_entries == 1) {
+ /*
+ * If we are at the root and only one entry is left we can just
+ * free this node and update the root pointer.
+ */
+ ASSERT(node == ifp->if_u1.if_root);
+ ifp->if_u1.if_root = node->ptrs[0];
+ ifp->if_height--;
+ kmem_free(node);
+ }
+}
+
+static void
+xfs_iext_rebalance_leaf(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_iext_leaf *leaf,
+ xfs_fileoff_t offset,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full we might never be able
+ * to merge our node, and will only delete it once the number of
+ * entries hits zero.
+ */
+ if (nr_entries == 0)
+ goto remove_node;
+
+ if (leaf->prev) {
+ int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
+
+ if (nr_prev + nr_entries <= RECS_PER_LEAF) {
+ for (i = 0; i < nr_entries; i++)
+ leaf->prev->recs[nr_prev + i] = leaf->recs[i];
+
+ if (cur->leaf == leaf) {
+ cur->leaf = leaf->prev;
+ cur->pos += nr_prev;
+ }
+ goto remove_node;
+ }
+ }
+
+ if (leaf->next) {
+ int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
+
+ if (nr_entries + nr_next <= RECS_PER_LEAF) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ leaf->recs[nr_entries + i] =
+ leaf->next->recs[i];
+ }
+
+ if (cur->leaf == leaf->next) {
+ cur->leaf = leaf;
+ cur->pos += nr_entries;
+ }
+
+ offset = xfs_iext_leaf_key(leaf->next, 0);
+ leaf = leaf->next;
+ goto remove_node;
+ }
+ }
+
+ return;
+remove_node:
+ if (leaf->prev)
+ leaf->prev->next = leaf->next;
+ if (leaf->next)
+ leaf->next->prev = leaf->prev;
+ xfs_iext_remove_node(ifp, offset, leaf);
+}
+
+static void
+xfs_iext_free_last_leaf(
+ struct xfs_ifork *ifp)
+{
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height--;
+ kmem_free(ifp->if_u1.if_root);
+}
+
+void
+xfs_iext_remove(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ xfs_fileoff_t offset = xfs_iext_leaf_key(leaf, 0);
+ int i, nr_entries;
+
+ trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
+
+ ASSERT(ifp->if_height > 0);
+ ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(xfs_iext_valid(ifp, cur));
+
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
+ for (i = cur->pos; i < nr_entries; i++)
+ leaf->recs[i] = leaf->recs[i + 1];
+ xfs_iext_rec_clear(&leaf->recs[nr_entries]);
+ ifp->if_bytes -= sizeof(struct xfs_iext_rec);
+
+ if (cur->pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
+ leaf);
+ offset = xfs_iext_leaf_key(leaf, 0);
+ } else if (cur->pos == nr_entries) {
+ if (ifp->if_height > 1 && leaf->next)
+ cur->leaf = leaf->next;
+ else
+ cur->leaf = NULL;
+ cur->pos = 0;
+ }
+
+ if (nr_entries >= RECS_PER_LEAF / 2)
+ return;
+
+ if (ifp->if_height > 1)
+ xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
+ else if (nr_entries == 0)
+ xfs_iext_free_last_leaf(ifp);
+}
+
+/*
+ * Lookup the extent covering bno.
+ *
+ * If there is an extent covering bno return the extent index, and store the
+ * expanded extent structure in *gotp, and the extent cursor in *cur.
+ * If there is no extent covering bno, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its cursor in *cur
+ * instead.
+ * If bno is beyond the last extent return false, and return an invalid
+ * cursor value.
+ */
+bool
+xfs_iext_lookup_extent(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+ cur->leaf = xfs_iext_find_level(ifp, offset, 1);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return false;
+ }
+
+ for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
+ struct xfs_iext_rec *rec = cur_rec(cur);
+
+ if (xfs_iext_rec_is_empty(rec))
+ break;
+ if (xfs_iext_rec_cmp(rec, offset) >= 0)
+ goto found;
+ }
+
+ /* Try looking in the next node for an entry > offset */
+ if (ifp->if_height == 1 || !cur->leaf->next)
+ return false;
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+found:
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * Returns the last extent before end, and if this extent doesn't cover
+ * end, update end to the end of the extent.
+ */
+bool
+xfs_iext_lookup_extent_before(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ /* could be optimized to not even look up the next on a match.. */
+ if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
+ gotp->br_startoff <= *end - 1)
+ return true;
+ if (!xfs_iext_prev_extent(ifp, cur, gotp))
+ return false;
+ *end = gotp->br_startoff + gotp->br_blockcount;
+ return true;
+}
+
+void
+xfs_iext_update_extent(
+ struct xfs_inode *ip,
+ int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *new)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ if (cur->pos == 0) {
+ struct xfs_bmbt_irec old;
+
+ xfs_iext_get(&old, cur_rec(cur));
+ if (new->br_startoff != old.br_startoff) {
+ xfs_iext_update_node(ifp, old.br_startoff,
+ new->br_startoff, 1, cur->leaf);
+ }
+ }
+
+ trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
+ xfs_iext_set(cur_rec(cur), new);
+ trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
+}
+
+/*
+ * Return true if the cursor points at an extent and return the extent structure
+ * in gotp. Else return false.
+ */
+bool
+xfs_iext_get_extent(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * This is a recursive function, because of that we need to be extremely
+ * careful with stack usage.
+ */
+static void
+xfs_iext_destroy_node(
+ struct xfs_iext_node *node,
+ int level)
+{
+ int i;
+
+ if (level > 1) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (node->keys[i] == XFS_IEXT_KEY_INVALID)
+ break;
+ xfs_iext_destroy_node(node->ptrs[i], level - 1);
+ }
+ }
+
+ kmem_free(node);
+}
+
+void
+xfs_iext_destroy(
+ struct xfs_ifork *ifp)
+{
+ xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+
+ ifp->if_bytes = 0;
+ ifp->if_height = 0;
+ ifp->if_u1.if_root = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 378f8fbc91a7..6b7989038d75 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -24,6 +24,7 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_icache.h"
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 31840ca24018..1c90ec41e9df 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,21 +42,27 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
+{
+ return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
+}
+
/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode. For fifos, devs, and sockets
- * this means set if_rdev to the proper value. For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers. For a file in B-tree format, only the root is immediately
- * brought in-core. The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
+ * Copy inode type and data and attr format specific information from the
+ * on-disk inode to the in-core inode and fork structures. For fifos, devices,
+ * and sockets this means set i_rdev to the proper value. For files,
+ * directories, and symlinks this means to bring in the in-line data or extent
+ * pointers as well as the attribute fork. For a fork in B-tree format, only
+ * the root is immediately brought in-core. The rest will be read in later when
+ * first referenced (see xfs_iread_extents()).
*/
int
xfs_iformat_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
{
- xfs_attr_shortform_t *atp;
+ struct inode *inode = VFS_I(ip);
+ struct xfs_attr_shortform *atp;
int size;
int error = 0;
xfs_fsize_t di_size;
@@ -95,8 +101,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
- if (unlikely(xfs_is_reflink_inode(ip) &&
- (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) {
+ if (unlikely(xfs_is_reflink_inode(ip) && !S_ISREG(inode->i_mode))) {
xfs_warn(ip->i_mount,
"corrupt dinode %llu, wrong file type for reflink.",
ip->i_ino);
@@ -115,7 +120,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
- switch (VFS_I(ip)->i_mode & S_IFMT) {
+ switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
@@ -126,7 +131,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
ip->i_d.di_size = 0;
- ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+ inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
break;
case S_IFREG:
@@ -184,8 +189,7 @@ xfs_iformat_fork(
return error;
/* Check inline dir contents. */
- if (S_ISDIR(VFS_I(ip)->i_mode) &&
- dip->di_format == XFS_DINODE_FMT_LOCAL) {
+ if (S_ISDIR(inode->i_mode) && dip->di_format == XFS_DINODE_FMT_LOCAL) {
error = xfs_dir2_sf_verify(ip);
if (error) {
xfs_idestroy_fork(ip, XFS_DATA_FORK);
@@ -265,19 +269,14 @@ xfs_init_local_fork(
if (zero_terminate)
mem_size++;
- if (size == 0)
- ifp->if_u1.if_data = NULL;
- else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- else {
+ if (size) {
real_size = roundup(mem_size, 4);
ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
- }
-
- if (size) {
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
+ } else {
+ ifp->if_u1.if_data = NULL;
}
ifp->if_bytes = size;
@@ -288,13 +287,6 @@ xfs_init_local_fork(
/*
* The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there. Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
*/
STATIC int
xfs_iformat_local(
@@ -324,9 +316,7 @@ xfs_iformat_local(
/*
* The file consists of a set of extents all of which fit into the on-disk
- * inode. If there are few enough extents to fit into the if_inline_ext, then
- * copy them there. Otherwise allocate a buffer for them and copy them into it.
- * Either way, set if_extents to point at the extents.
+ * inode.
*/
STATIC int
xfs_iformat_extents(
@@ -336,9 +326,12 @@ xfs_iformat_extents(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int state = xfs_bmap_fork_to_state(whichfork);
int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
+ struct xfs_bmbt_irec new;
int i;
/*
@@ -354,27 +347,25 @@ xfs_iformat_extents(
}
ifp->if_real_bytes = 0;
- if (nex == 0)
- ifp->if_u1.if_extents = NULL;
- else if (nex <= XFS_INLINE_EXTS)
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- else
- xfs_iext_add(ifp, 0, nex);
-
- ifp->if_bytes = size;
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+
+ xfs_iext_first(ifp, &icur);
for (i = 0; i < nex; i++, dp++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- ep->l0 = get_unaligned_be64(&dp->l0);
- ep->l1 = get_unaligned_be64(&dp->l1);
- if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) {
+ xfs_bmbt_disk_get_all(dp, &new);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) {
XFS_ERROR_REPORT("xfs_iformat_extents(2)",
XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
+
+ xfs_iext_insert(ip, &icur, &new, state);
+ trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
+ xfs_iext_next(ifp, &icur);
}
- XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
}
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
@@ -440,47 +431,14 @@ xfs_iformat_btree(
ifp->if_flags &= ~XFS_IFEXTENTS;
ifp->if_flags |= XFS_IFBROOT;
+ ifp->if_real_bytes = 0;
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
return 0;
}
/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- int whichfork)
-{
- int error;
- xfs_ifork_t *ifp;
- xfs_extnum_t nextents;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
- nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
- ifp = XFS_IFORK_PTR(ip, whichfork);
-
- /*
- * We know that the size is valid (it's checked in iformat_btree)
- */
- ifp->if_bytes = ifp->if_real_bytes = 0;
- xfs_iext_add(ifp, 0, nextents);
- error = xfs_bmap_read_extents(tp, ip, whichfork);
- if (error) {
- xfs_iext_destroy(ifp);
- return error;
- }
- ifp->if_flags |= XFS_IFEXTENTS;
- return 0;
-}
-/*
* Reallocate the space for if_broot based on the number of records
* being added or deleted as indicated in rec_diff. Move the records
* and pointers in if_broot to fit the new size. When shrinking this
@@ -644,26 +602,9 @@ xfs_idata_realloc(
ASSERT(new_size >= 0);
if (new_size == 0) {
- if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- kmem_free(ifp->if_u1.if_data);
- }
+ kmem_free(ifp->if_u1.if_data);
ifp->if_u1.if_data = NULL;
real_size = 0;
- } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
- /*
- * If the valid extents/data can fit in if_inline_ext/data,
- * copy them from the malloc'd vector and free it.
- */
- if (ifp->if_u1.if_data == NULL) {
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- ASSERT(ifp->if_real_bytes != 0);
- memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
- new_size);
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- }
- real_size = 0;
} else {
/*
* Stuck with malloc/realloc.
@@ -677,7 +618,7 @@ xfs_idata_realloc(
ASSERT(ifp->if_real_bytes == 0);
ifp->if_u1.if_data = kmem_alloc(real_size,
KM_SLEEP | KM_NOFS);
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ } else {
/*
* Only do the realloc if the underlying size
* is really changing.
@@ -688,12 +629,6 @@ xfs_idata_realloc(
real_size,
KM_SLEEP | KM_NOFS);
}
- } else {
- ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size,
- KM_SLEEP | KM_NOFS);
- memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
- ifp->if_bytes);
}
}
ifp->if_real_bytes = real_size;
@@ -721,23 +656,18 @@ xfs_idestroy_fork(
* so check and free it up if we do.
*/
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
- (ifp->if_u1.if_data != NULL)) {
+ if (ifp->if_u1.if_data != NULL) {
ASSERT(ifp->if_real_bytes != 0);
kmem_free(ifp->if_u1.if_data);
ifp->if_u1.if_data = NULL;
ifp->if_real_bytes = 0;
}
- } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
- ((ifp->if_flags & XFS_IFEXTIREC) ||
- ((ifp->if_u1.if_extents != NULL) &&
- (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
- ASSERT(ifp->if_real_bytes != 0);
+ } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) {
xfs_iext_destroy(ifp);
}
- ASSERT(ifp->if_u1.if_extents == NULL ||
- ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+
ASSERT(ifp->if_real_bytes == 0);
+
if (whichfork == XFS_ATTR_FORK) {
kmem_zone_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
@@ -747,19 +677,9 @@ xfs_idestroy_fork(
}
}
-/* Count number of incore extents based on if_bytes */
-xfs_extnum_t
-xfs_iext_count(struct xfs_ifork *ifp)
-{
- return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-}
-
/*
* Convert in-core extents to on-disk form
*
- * For either the data or attr fork in extent format, we need to endian convert
- * the in-core extent as we place them into the on-disk inode.
- *
* In the case of the data fork, the in-core and on-disk fork sizes can be
* different due to delayed allocation extents. We only copy on-disk extents
* here, so callers must always use the physical fork size to determine the
@@ -768,53 +688,32 @@ xfs_iext_count(struct xfs_ifork *ifp)
*/
int
xfs_iextents_copy(
- xfs_inode_t *ip,
- xfs_bmbt_rec_t *dp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_rec *dp,
int whichfork)
{
- int copied;
- int i;
- xfs_ifork_t *ifp;
- int nrecs;
- xfs_fsblock_t start_block;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ int copied = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(ifp->if_bytes > 0);
- nrecs = xfs_iext_count(ifp);
- XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
- ASSERT(nrecs > 0);
-
- /*
- * There are some delayed allocation extents in the
- * inode, so copy the extents one at a time and skip
- * the delayed ones. There must be at least one
- * non-delayed extent.
- */
- copied = 0;
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-
- ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep));
-
- start_block = xfs_bmbt_get_startblock(ep);
- if (isnullstartblock(start_block)) {
- /*
- * It's a delayed allocation extent, so skip it.
- */
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
continue;
- }
-
- /* Translate to on disk format */
- put_unaligned_be64(ep->l0, &dp->l0);
- put_unaligned_be64(ep->l1, &dp->l1);
+ ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec));
+ xfs_bmbt_disk_set_all(dp, &rec);
+ trace_xfs_write_extent(ip, &icur, state, _RET_IP_);
+ copied += sizeof(struct xfs_bmbt_rec);
dp++;
- copied++;
}
- ASSERT(copied != 0);
- return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+ ASSERT(copied > 0);
+ ASSERT(copied <= ifp->if_bytes);
+ return copied;
}
/*
@@ -872,7 +771,6 @@ xfs_iflush_fork(
!(iip->ili_fields & extflag[whichfork]));
if ((iip->ili_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(xfs_iext_get_ext(ifp, 0));
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
whichfork);
@@ -894,16 +792,7 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_DEV:
if (iip->ili_fields & XFS_ILOG_DEV) {
ASSERT(whichfork == XFS_DATA_FORK);
- xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
- }
- break;
-
- case XFS_DINODE_FMT_UUID:
- if (iip->ili_fields & XFS_ILOG_UUID) {
- ASSERT(whichfork == XFS_DATA_FORK);
- memcpy(XFS_DFORK_DPTR(dip),
- &ip->i_df.if_u2.if_uuid,
- sizeof(uuid_t));
+ xfs_dinode_put_rdev(dip, sysv_encode_dev(VFS_I(ip)->i_rdev));
}
break;
@@ -913,33 +802,6 @@ xfs_iflush_fork(
}
}
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx) /* index of target extent */
-{
- ASSERT(idx >= 0);
- ASSERT(idx < xfs_iext_count(ifp));
-
- if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
- return ifp->if_u1.if_ext_irec->er_extbuf;
- } else if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_ext_irec_t *erp; /* irec pointer */
- int erp_idx = 0; /* irec index */
- xfs_extnum_t page_idx = idx; /* ext index in target list */
-
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
- return &erp->er_extbuf[page_idx];
- } else if (ifp->if_bytes) {
- return &ifp->if_u1.if_extents[idx];
- } else {
- return NULL;
- }
-}
-
/* Convert bmap state flags to an inode fork. */
struct xfs_ifork *
xfs_iext_state_to_fork(
@@ -954,1011 +816,6 @@ xfs_iext_state_to_fork(
}
/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'. 'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* starting index of new items */
- xfs_extnum_t count, /* number of inserted items */
- xfs_bmbt_irec_t *new, /* items to insert */
- int state) /* type of extent conversion */
-{
- xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
- xfs_extnum_t i; /* extent record index */
-
- trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- xfs_iext_add(ifp, idx, count);
- for (i = idx; i < idx + count; i++, new++)
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin adding exts */
- int ext_diff) /* number of extents to add */
-{
- int byte_diff; /* new bytes being added */
- int new_size; /* size of extents after adding */
- xfs_extnum_t nextents; /* number of extents in file */
-
- nextents = xfs_iext_count(ifp);
- ASSERT((idx >= 0) && (idx <= nextents));
- byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
- new_size = ifp->if_bytes + byte_diff;
- /*
- * If the new number of extents (nextents + ext_diff)
- * fits inside the inode, then continue to use the inline
- * extent buffer.
- */
- if (nextents + ext_diff <= XFS_INLINE_EXTS) {
- if (idx < nextents) {
- memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
- &ifp->if_u2.if_inline_ext[idx],
- (nextents - idx) * sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
- }
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- ifp->if_real_bytes = 0;
- }
- /*
- * Otherwise use a linear (direct) extent list.
- * If the extents are currently inside the inode,
- * xfs_iext_realloc_direct will switch us from
- * inline to direct extent allocation mode.
- */
- else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
- xfs_iext_realloc_direct(ifp, new_size);
- if (idx < nextents) {
- memmove(&ifp->if_u1.if_extents[idx + ext_diff],
- &ifp->if_u1.if_extents[idx],
- (nextents - idx) * sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
- }
- }
- /* Indirection array */
- else {
- xfs_ext_irec_t *erp;
- int erp_idx = 0;
- int page_idx = idx;
-
- ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
- if (ifp->if_flags & XFS_IFEXTIREC) {
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
- } else {
- xfs_iext_irec_init(ifp);
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = ifp->if_u1.if_ext_irec;
- }
- /* Extents fit in target extent page */
- if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
- if (page_idx < erp->er_extcount) {
- memmove(&erp->er_extbuf[page_idx + ext_diff],
- &erp->er_extbuf[page_idx],
- (erp->er_extcount - page_idx) *
- sizeof(xfs_bmbt_rec_t));
- memset(&erp->er_extbuf[page_idx], 0, byte_diff);
- }
- erp->er_extcount += ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- }
- /* Insert a new extent page */
- else if (erp) {
- xfs_iext_add_indirect_multi(ifp,
- erp_idx, page_idx, ext_diff);
- }
- /*
- * If extent(s) are being appended to the last page in
- * the indirection array and the new extent(s) don't fit
- * in the page, then erp is NULL and erp_idx is set to
- * the next index needed in the indirection array.
- */
- else {
- uint count = ext_diff;
-
- while (count) {
- erp = xfs_iext_irec_new(ifp, erp_idx);
- erp->er_extcount = min(count, XFS_LINEAR_EXTS);
- count -= erp->er_extcount;
- if (count)
- erp_idx++;
- }
- }
- }
- ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- * |-------| |-------|
- * | | | | idx - number of extents before idx
- * | idx | | count |
- * | | | | count - number of extents being inserted at idx
- * |-------| |-------|
- * | count | | nex2 | nex2 - number of extents after idx + count
- * |-------| |-------|
- */
-void
-xfs_iext_add_indirect_multi(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx, /* target extent irec index */
- xfs_extnum_t idx, /* index within target list */
- int count) /* new extents being added */
-{
- int byte_diff; /* new bytes being added */
- xfs_ext_irec_t *erp; /* pointer to irec entry */
- xfs_extnum_t ext_diff; /* number of extents to add */
- xfs_extnum_t ext_cnt; /* new extents still needed */
- xfs_extnum_t nex2; /* extents after idx + count */
- xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
- int nlists; /* number of irec's (lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- nex2 = erp->er_extcount - idx;
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
- /*
- * Save second part of target extent list
- * (all extents past */
- if (nex2) {
- byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
- memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
- erp->er_extcount -= nex2;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
- memset(&erp->er_extbuf[idx], 0, byte_diff);
- }
-
- /*
- * Add the new extents to the end of the target
- * list, then allocate new irec record(s) and
- * extent buffer(s) as needed to store the rest
- * of the new extents.
- */
- ext_cnt = count;
- ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
- if (ext_diff) {
- erp->er_extcount += ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- ext_cnt -= ext_diff;
- }
- while (ext_cnt) {
- erp_idx++;
- erp = xfs_iext_irec_new(ifp, erp_idx);
- ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
- erp->er_extcount = ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- ext_cnt -= ext_diff;
- }
-
- /* Add nex2 extents back to indirection array */
- if (nex2) {
- xfs_extnum_t ext_avail;
- int i;
-
- byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
- i = 0;
- /*
- * If nex2 extents fit in the current page, append
- * nex2_ep after the new extents.
- */
- if (nex2 <= ext_avail) {
- i = erp->er_extcount;
- }
- /*
- * Otherwise, check if space is available in the
- * next page.
- */
- else if ((erp_idx < nlists - 1) &&
- (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
- ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
- erp_idx++;
- erp++;
- /* Create a hole for nex2 extents */
- memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
- erp->er_extcount * sizeof(xfs_bmbt_rec_t));
- }
- /*
- * Final choice, create a new extent page for
- * nex2 extents.
- */
- else {
- erp_idx++;
- erp = xfs_iext_irec_new(ifp, erp_idx);
- }
- memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
- kmem_free(nex2_ep);
- erp->er_extcount += nex2;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
- }
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array. Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff, /* number of extents to remove */
- int state) /* type of extent conversion */
-{
- xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
- xfs_extnum_t nextents; /* number of extents in file */
- int new_size; /* size of extents after removal */
-
- trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
- ASSERT(ext_diff > 0);
- nextents = xfs_iext_count(ifp);
- new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- } else if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_iext_remove_indirect(ifp, idx, ext_diff);
- } else if (ifp->if_real_bytes) {
- xfs_iext_remove_direct(ifp, idx, ext_diff);
- } else {
- xfs_iext_remove_inline(ifp, idx, ext_diff);
- }
- ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff) /* number of extents to remove */
-{
- int nextents; /* number of extents in file */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- ASSERT(idx < XFS_INLINE_EXTS);
- nextents = xfs_iext_count(ifp);
- ASSERT(((nextents - ext_diff) > 0) &&
- (nextents - ext_diff) < XFS_INLINE_EXTS);
-
- if (idx + ext_diff < nextents) {
- memmove(&ifp->if_u2.if_inline_ext[idx],
- &ifp->if_u2.if_inline_ext[idx + ext_diff],
- (nextents - (idx + ext_diff)) *
- sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
- 0, ext_diff * sizeof(xfs_bmbt_rec_t));
- } else {
- memset(&ifp->if_u2.if_inline_ext[idx], 0,
- ext_diff * sizeof(xfs_bmbt_rec_t));
- }
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (ie. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff) /* number of extents to remove */
-{
- xfs_extnum_t nextents; /* number of extents in file */
- int new_size; /* size of extents after removal */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- new_size = ifp->if_bytes -
- (ext_diff * sizeof(xfs_bmbt_rec_t));
- nextents = xfs_iext_count(ifp);
-
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- return;
- }
- /* Move extents up in the list (if needed) */
- if (idx + ext_diff < nextents) {
- memmove(&ifp->if_u1.if_extents[idx],
- &ifp->if_u1.if_extents[idx + ext_diff],
- (nextents - (idx + ext_diff)) *
- sizeof(xfs_bmbt_rec_t));
- }
- memset(&ifp->if_u1.if_extents[nextents - ext_diff],
- 0, ext_diff * sizeof(xfs_bmbt_rec_t));
- /*
- * Reallocate the direct extent list. If the extents
- * will fit inside the inode then xfs_iext_realloc_direct
- * will switch from direct to inline extent allocation
- * mode for us.
- */
- xfs_iext_realloc_direct(ifp, new_size);
- ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- * |-------| |-------|
- * | nex1 | | | nex1 - number of extents before idx
- * |-------| | count |
- * | | | | count - number of extents being removed at idx
- * | count | |-------|
- * | | | nex2 | nex2 - number of extents after idx + count
- * |-------| |-------|
- */
-void
-xfs_iext_remove_indirect(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing extents */
- int count) /* number of extents to remove */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int erp_idx = 0; /* indirection array index */
- xfs_extnum_t ext_cnt; /* extents left to remove */
- xfs_extnum_t ext_diff; /* extents to remove in current list */
- xfs_extnum_t nex1; /* number of extents before idx */
- xfs_extnum_t nex2; /* extents after idx + count */
- int page_idx = idx; /* index in target extent list */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
- ASSERT(erp != NULL);
- nex1 = page_idx;
- ext_cnt = count;
- while (ext_cnt) {
- nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
- ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
- /*
- * Check for deletion of entire list;
- * xfs_iext_irec_remove() updates extent offsets.
- */
- if (ext_diff == erp->er_extcount) {
- xfs_iext_irec_remove(ifp, erp_idx);
- ext_cnt -= ext_diff;
- nex1 = 0;
- if (ext_cnt) {
- ASSERT(erp_idx < ifp->if_real_bytes /
- XFS_IEXT_BUFSZ);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- nex1 = 0;
- continue;
- } else {
- break;
- }
- }
- /* Move extents up (if needed) */
- if (nex2) {
- memmove(&erp->er_extbuf[nex1],
- &erp->er_extbuf[nex1 + ext_diff],
- nex2 * sizeof(xfs_bmbt_rec_t));
- }
- /* Zero out rest of page */
- memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
- ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
- /* Update remaining counters */
- erp->er_extcount -= ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
- ext_cnt -= ext_diff;
- nex1 = 0;
- erp_idx++;
- erp++;
- }
- ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
- xfs_iext_irec_compact(ifp);
-}
-
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new size of extents after adding */
-{
- int rnew_size; /* real new size of extents */
-
- rnew_size = new_size;
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
- ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
- (new_size != ifp->if_real_bytes)));
-
- /* Free extent records */
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- }
- /* Resize direct extent list and zero any new bytes */
- else if (ifp->if_real_bytes) {
- /* Check if extents will fit inside the inode */
- if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
- xfs_iext_direct_to_inline(ifp, new_size /
- (uint)sizeof(xfs_bmbt_rec_t));
- ifp->if_bytes = new_size;
- return;
- }
- if (!is_power_of_2(new_size)){
- rnew_size = roundup_pow_of_two(new_size);
- }
- if (rnew_size != ifp->if_real_bytes) {
- ifp->if_u1.if_extents =
- kmem_realloc(ifp->if_u1.if_extents,
- rnew_size, KM_NOFS);
- }
- if (rnew_size > ifp->if_real_bytes) {
- memset(&ifp->if_u1.if_extents[ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)], 0,
- rnew_size - ifp->if_real_bytes);
- }
- }
- /* Switch from the inline extent buffer to a direct extent list */
- else {
- if (!is_power_of_2(new_size)) {
- rnew_size = roundup_pow_of_two(new_size);
- }
- xfs_iext_inline_to_direct(ifp, rnew_size);
- }
- ifp->if_real_bytes = rnew_size;
- ifp->if_bytes = new_size;
-}
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t nextents) /* number of extents in file */
-{
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(nextents <= XFS_INLINE_EXTS);
- /*
- * The inline buffer was zeroed when we switched
- * from inline to direct extent allocation mode,
- * so we don't need to clear it here.
- */
- memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
- nextents * sizeof(xfs_bmbt_rec_t));
- kmem_free(ifp->if_u1.if_extents);
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* number of extents in file */
-{
- ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
- memset(ifp->if_u1.if_extents, 0, new_size);
- if (ifp->if_bytes) {
- memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
- ifp->if_bytes);
- memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
- sizeof(xfs_bmbt_rec_t));
- }
- ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new indirection array size */
-{
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(ifp->if_real_bytes);
- ASSERT((new_size >= 0) &&
- (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
- sizeof(xfs_ext_irec_t))));
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- } else {
- ifp->if_u1.if_ext_irec =
- kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
- }
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t nextents; /* number of extents in file */
- int size; /* size of file extents */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nextents = xfs_iext_count(ifp);
- ASSERT(nextents <= XFS_LINEAR_EXTS);
- size = nextents * sizeof(xfs_bmbt_rec_t);
-
- xfs_iext_irec_compact_pages(ifp);
- ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
- ep = ifp->if_u1.if_ext_irec->er_extbuf;
- kmem_free(ifp->if_u1.if_ext_irec);
- ifp->if_flags &= ~XFS_IFEXTIREC;
- ifp->if_u1.if_extents = ep;
- ifp->if_bytes = size;
- if (nextents < XFS_LINEAR_EXTS) {
- xfs_iext_realloc_direct(ifp, size);
- }
-}
-
-/*
- * Remove all records from the indirection array.
- */
-STATIC void
-xfs_iext_irec_remove_all(
- struct xfs_ifork *ifp)
-{
- int nlists;
- int i;
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (i = 0; i < nlists; i++)
- kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
- kmem_free(ifp->if_u1.if_ext_irec);
- ifp->if_flags &= ~XFS_IFEXTIREC;
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_iext_irec_remove_all(ifp);
- } else if (ifp->if_real_bytes) {
- kmem_free(ifp->if_u1.if_extents);
- } else if (ifp->if_bytes) {
- memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
- sizeof(xfs_bmbt_rec_t));
- }
- ifp->if_u1.if_extents = NULL;
- ifp->if_real_bytes = 0;
- ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t * /* pointer to found extent record */
-xfs_iext_bno_to_ext(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number to search for */
- xfs_extnum_t *idxp) /* index of target extent */
-{
- xfs_bmbt_rec_host_t *base; /* pointer to first extent */
- xfs_filblks_t blockcount = 0; /* number of blocks in extent */
- xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
- xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
- int high; /* upper boundary in search */
- xfs_extnum_t idx = 0; /* index of target extent */
- int low; /* lower boundary in search */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_fileoff_t startoff = 0; /* start offset of extent */
-
- nextents = xfs_iext_count(ifp);
- if (nextents == 0) {
- *idxp = 0;
- return NULL;
- }
- low = 0;
- if (ifp->if_flags & XFS_IFEXTIREC) {
- /* Find target extent list */
- int erp_idx = 0;
- erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
- base = erp->er_extbuf;
- high = erp->er_extcount - 1;
- } else {
- base = ifp->if_u1.if_extents;
- high = nextents - 1;
- }
- /* Binary search extent records */
- while (low <= high) {
- idx = (low + high) >> 1;
- ep = base + idx;
- startoff = xfs_bmbt_get_startoff(ep);
- blockcount = xfs_bmbt_get_blockcount(ep);
- if (bno < startoff) {
- high = idx - 1;
- } else if (bno >= startoff + blockcount) {
- low = idx + 1;
- } else {
- /* Convert back to file-based extent index */
- if (ifp->if_flags & XFS_IFEXTIREC) {
- idx += erp->er_extoff;
- }
- *idxp = idx;
- return ep;
- }
- }
- /* Convert back to file-based extent index */
- if (ifp->if_flags & XFS_IFEXTIREC) {
- idx += erp->er_extoff;
- }
- if (bno >= startoff + blockcount) {
- if (++idx == nextents) {
- ep = NULL;
- } else {
- ep = xfs_iext_get_ext(ifp, idx);
- }
- }
- *idxp = idx;
- return ep;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t * /* pointer to found extent record */
-xfs_iext_bno_to_irec(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number to search for */
- int *erp_idxp) /* irec index of target ext list */
-{
- xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
- xfs_ext_irec_t *erp_next; /* next indirection array entry */
- int erp_idx; /* indirection array index */
- int nlists; /* number of extent irec's (lists) */
- int high; /* binary search upper limit */
- int low; /* binary search lower limit */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp_idx = 0;
- low = 0;
- high = nlists - 1;
- while (low <= high) {
- erp_idx = (low + high) >> 1;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
- if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
- high = erp_idx - 1;
- } else if (erp_next && bno >=
- xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
- low = erp_idx + 1;
- } else {
- break;
- }
- }
- *erp_idxp = erp_idx;
- return erp;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t *idxp, /* extent index (file -> page) */
- int *erp_idxp, /* pointer to target irec */
- int realloc) /* new bytes were just added */
-{
- xfs_ext_irec_t *prev; /* pointer to previous irec */
- xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
- int erp_idx; /* indirection array index */
- int nlists; /* number of irec's (ex lists) */
- int high; /* binary search upper limit */
- int low; /* binary search lower limit */
- xfs_extnum_t page_idx = *idxp; /* extent index in target list */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(page_idx >= 0);
- ASSERT(page_idx <= xfs_iext_count(ifp));
- ASSERT(page_idx < xfs_iext_count(ifp) || realloc);
-
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp_idx = 0;
- low = 0;
- high = nlists - 1;
-
- /* Binary search extent irec's */
- while (low <= high) {
- erp_idx = (low + high) >> 1;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- prev = erp_idx > 0 ? erp - 1 : NULL;
- if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
- realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
- high = erp_idx - 1;
- } else if (page_idx > erp->er_extoff + erp->er_extcount ||
- (page_idx == erp->er_extoff + erp->er_extcount &&
- !realloc)) {
- low = erp_idx + 1;
- } else if (page_idx == erp->er_extoff + erp->er_extcount &&
- erp->er_extcount == XFS_LINEAR_EXTS) {
- ASSERT(realloc);
- page_idx = 0;
- erp_idx++;
- erp = erp_idx < nlists ? erp + 1 : NULL;
- break;
- } else {
- page_idx -= erp->er_extoff;
- break;
- }
- }
- *idxp = page_idx;
- *erp_idxp = erp_idx;
- return erp;
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- xfs_extnum_t nextents; /* number of extents in file */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- nextents = xfs_iext_count(ifp);
- ASSERT(nextents <= XFS_LINEAR_EXTS);
-
- erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
- if (nextents == 0) {
- ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
- } else if (!ifp->if_real_bytes) {
- xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
- } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
- xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
- }
- erp->er_extbuf = ifp->if_u1.if_extents;
- erp->er_extcount = nextents;
- erp->er_extoff = 0;
-
- ifp->if_flags |= XFS_IFEXTIREC;
- ifp->if_real_bytes = XFS_IEXT_BUFSZ;
- ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
- ifp->if_u1.if_ext_irec = erp;
-
- return;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx) /* index for new irec */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
- /* Resize indirection array */
- xfs_iext_realloc_indirect(ifp, ++nlists *
- sizeof(xfs_ext_irec_t));
- /*
- * Move records down in the array so the
- * new page can use erp_idx.
- */
- erp = ifp->if_u1.if_ext_irec;
- for (i = nlists - 1; i > erp_idx; i--) {
- memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
- }
- ASSERT(i == erp_idx);
-
- /* Initialize new extent record */
- erp = ifp->if_u1.if_ext_irec;
- erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
- ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
- memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
- erp[erp_idx].er_extcount = 0;
- erp[erp_idx].er_extoff = erp_idx > 0 ?
- erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
- return (&erp[erp_idx]);
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx) /* irec index to remove */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- if (erp->er_extbuf) {
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
- -erp->er_extcount);
- kmem_free(erp->er_extbuf);
- }
- /* Compact extent records */
- erp = ifp->if_u1.if_ext_irec;
- for (i = erp_idx; i < nlists - 1; i++) {
- memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
- }
- /*
- * Manually free the last extent record from the indirection
- * array. A call to xfs_iext_realloc_indirect() with a size
- * of zero would result in a call to xfs_iext_destroy() which
- * would in turn call this function again, creating a nasty
- * infinite loop.
- */
- if (--nlists) {
- xfs_iext_realloc_indirect(ifp,
- nlists * sizeof(xfs_ext_irec_t));
- } else {
- kmem_free(ifp->if_u1.if_ext_irec);
- }
- ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array. Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible. The
- * compaction policy is as follows:
- *
- * Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- * No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_extnum_t nextents; /* number of extents in file */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- nextents = xfs_iext_count(ifp);
-
- if (nextents == 0) {
- xfs_iext_destroy(ifp);
- } else if (nextents <= XFS_INLINE_EXTS) {
- xfs_iext_indirect_to_direct(ifp);
- xfs_iext_direct_to_inline(ifp, nextents);
- } else if (nextents <= XFS_LINEAR_EXTS) {
- xfs_iext_indirect_to_direct(ifp);
- } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
- xfs_iext_irec_compact_pages(ifp);
- }
-}
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
- int erp_idx = 0; /* indirection array index */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- while (erp_idx < nlists - 1) {
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- erp_next = erp + 1;
- if (erp_next->er_extcount <=
- (XFS_LINEAR_EXTS - erp->er_extcount)) {
- memcpy(&erp->er_extbuf[erp->er_extcount],
- erp_next->er_extbuf, erp_next->er_extcount *
- sizeof(xfs_bmbt_rec_t));
- erp->er_extcount += erp_next->er_extcount;
- /*
- * Free page before removing extent record
- * so er_extoffs don't get modified in
- * xfs_iext_irec_remove.
- */
- kmem_free(erp_next->er_extbuf);
- erp_next->er_extbuf = NULL;
- xfs_iext_irec_remove(ifp, erp_idx + 1);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- } else {
- erp_idx++;
- }
- }
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx, /* irec index to update */
- int ext_diff) /* number of new extents */
-{
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (i = erp_idx; i < nlists; i++) {
- ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
- }
-}
-
-/*
* Initialize an inode's copy-on-write fork.
*/
void
@@ -1974,61 +831,3 @@ xfs_ifork_init_cow(
ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
ip->i_cnextents = 0;
}
-
-/*
- * Lookup the extent covering bno.
- *
- * If there is an extent covering bno return the extent index, and store the
- * expanded extent structure in *gotp, and the extent index in *idx.
- * If there is no extent covering bno, but there is an extent after it (e.g.
- * it lies in a hole) return that extent in *gotp and its index in *idx
- * instead.
- * If bno is beyond the last extent return false, and return the index after
- * the last valid index in *idxp.
- */
-bool
-xfs_iext_lookup_extent(
- struct xfs_inode *ip,
- struct xfs_ifork *ifp,
- xfs_fileoff_t bno,
- xfs_extnum_t *idxp,
- struct xfs_bmbt_irec *gotp)
-{
- struct xfs_bmbt_rec_host *ep;
-
- XFS_STATS_INC(ip->i_mount, xs_look_exlist);
-
- ep = xfs_iext_bno_to_ext(ifp, bno, idxp);
- if (!ep)
- return false;
- xfs_bmbt_get_all(ep, gotp);
- return true;
-}
-
-/*
- * Return true if there is an extent at index idx, and return the expanded
- * extent structure at idx in that case. Else return false.
- */
-bool
-xfs_iext_get_extent(
- struct xfs_ifork *ifp,
- xfs_extnum_t idx,
- struct xfs_bmbt_irec *gotp)
-{
- if (idx < 0 || idx >= xfs_iext_count(ifp))
- return false;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp);
- return true;
-}
-
-void
-xfs_iext_update_extent(
- struct xfs_ifork *ifp,
- xfs_extnum_t idx,
- struct xfs_bmbt_irec *gotp)
-{
- ASSERT(idx >= 0);
- ASSERT(idx < xfs_iext_count(ifp));
-
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp);
-}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 11af705219f6..b9f0098e33b8 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -22,56 +22,19 @@ struct xfs_inode_log_item;
struct xfs_dinode;
/*
- * The following xfs_ext_irec_t struct introduces a second (top) level
- * to the in-core extent allocation scheme. These structs are allocated
- * in a contiguous block, creating an indirection array where each entry
- * (irec) contains a pointer to a buffer of in-core extent records which
- * it manages. Each extent buffer is 4k in size, since 4k is the system
- * page size on Linux i386 and systems with larger page sizes don't seem
- * to gain much, if anything, by using their native page size as the
- * extent buffer size. Also, using 4k extent buffers everywhere provides
- * a consistent interface for CXFS across different platforms.
- *
- * There is currently no limit on the number of irec's (extent lists)
- * allowed, so heavily fragmented files may require an indirection array
- * which spans multiple system pages of memory. The number of extents
- * which would require this amount of contiguous memory is very large
- * and should not cause problems in the foreseeable future. However,
- * if the memory needed for the contiguous array ever becomes a problem,
- * it is possible that a third level of indirection may be required.
- */
-typedef struct xfs_ext_irec {
- xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
- xfs_extnum_t er_extoff; /* extent offset in file */
- xfs_extnum_t er_extcount; /* number of extents in page/block */
-} xfs_ext_irec_t;
-
-/*
* File incore extent information, present for each of data & attr forks.
*/
-#define XFS_IEXT_BUFSZ 4096
-#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
-#define XFS_INLINE_EXTS 2
-#define XFS_INLINE_DATA 32
typedef struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
int if_real_bytes; /* bytes allocated in if_u1 */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
+ int if_height; /* height of the extent tree */
union {
- xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
- xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
+ void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
- union {
- xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
- /* very small file extents */
- char if_inline_data[XFS_INLINE_DATA];
- /* very small file data */
- xfs_dev_t if_rdev; /* dev number if special */
- uuid_t if_uuid; /* mount point value */
- } if_u2;
} xfs_ifork_t;
/*
@@ -80,7 +43,6 @@ typedef struct xfs_ifork {
#define XFS_IFINLINE 0x01 /* Inline data is read in */
#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
-#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
/*
* Fork handling.
@@ -150,45 +112,75 @@ int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
-struct xfs_bmbt_rec_host *
- xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
-xfs_extnum_t xfs_iext_count(struct xfs_ifork *);
-void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
- struct xfs_bmbt_irec *, int);
-void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
- xfs_extnum_t, int);
-void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
-void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_realloc_direct(struct xfs_ifork *, int);
-void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
-void xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *, int);
+void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
+ int);
void xfs_iext_destroy(struct xfs_ifork *);
-struct xfs_bmbt_rec_host *
- xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
-struct xfs_ext_irec *
- xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
-struct xfs_ext_irec *
- xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
- int);
-void xfs_iext_irec_init(struct xfs_ifork *);
-struct xfs_ext_irec *
- xfs_iext_irec_new(struct xfs_ifork *, int);
-void xfs_iext_irec_remove(struct xfs_ifork *, int);
-void xfs_iext_irec_compact(struct xfs_ifork *);
-void xfs_iext_irec_compact_pages(struct xfs_ifork *);
-void xfs_iext_irec_compact_full(struct xfs_ifork *);
-void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
bool xfs_iext_lookup_extent(struct xfs_inode *ip,
struct xfs_ifork *ifp, xfs_fileoff_t bno,
- xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp);
-bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *gotp);
-void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+bool xfs_iext_lookup_extent_before(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *gotp);
+bool xfs_iext_get_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+void xfs_iext_update_extent(struct xfs_inode *ip, int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+
+void xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *);
+
+static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_next(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+static inline bool xfs_iext_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_prev(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+/*
+ * Return the extent after cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_next(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+/*
+ * Return the extent before cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_prev(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+#define for_each_xfs_iext(ifp, ext, got) \
+ for (xfs_iext_first((ifp), (ext)); \
+ xfs_iext_get_extent((ifp), (ext), (got)); \
+ xfs_iext_next((ifp), (ext)))
extern struct kmem_zone *xfs_ifork_zone;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 8372e9bcd7b6..996f035ee205 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -264,54 +264,43 @@ typedef struct xfs_trans_header {
* (if any) is indicated in the ilf_dsize field. Changes to this structure
* must be added on to the end.
*/
-typedef struct xfs_inode_log_format {
- uint16_t ilf_type; /* inode log item type */
- uint16_t ilf_size; /* size of this item */
- uint32_t ilf_fields; /* flags for fields logged */
- uint16_t ilf_asize; /* size of attr d/ext/root */
- uint16_t ilf_dsize; /* size of data/ext/root */
- uint64_t ilf_ino; /* inode number */
- union {
- uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
- } ilf_u;
- int64_t ilf_blkno; /* blkno of inode buffer */
- int32_t ilf_len; /* len of inode buffer */
- int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
+struct xfs_inode_log_format {
uint16_t ilf_type; /* inode log item type */
uint16_t ilf_size; /* size of this item */
uint32_t ilf_fields; /* flags for fields logged */
uint16_t ilf_asize; /* size of attr d/ext/root */
uint16_t ilf_dsize; /* size of data/ext/root */
+ uint32_t ilf_pad; /* pad for 64 bit boundary */
uint64_t ilf_ino; /* inode number */
union {
uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
+ u8 __pad[16]; /* unused */
} ilf_u;
int64_t ilf_blkno; /* blkno of inode buffer */
int32_t ilf_len; /* len of inode buffer */
int32_t ilf_boffset; /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
+};
-typedef struct xfs_inode_log_format_64 {
+/*
+ * Old 32 bit systems will log in this format without the 64 bit
+ * alignment padding. Recovery will detect this and convert it to the
+ * correct format.
+ */
+struct xfs_inode_log_format_32 {
uint16_t ilf_type; /* inode log item type */
uint16_t ilf_size; /* size of this item */
uint32_t ilf_fields; /* flags for fields logged */
uint16_t ilf_asize; /* size of attr d/ext/root */
uint16_t ilf_dsize; /* size of data/ext/root */
- uint32_t ilf_pad; /* pad for 64 bit boundary */
uint64_t ilf_ino; /* inode number */
union {
uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
+ u8 __pad[16]; /* unused */
} ilf_u;
int64_t ilf_blkno; /* blkno of inode buffer */
int32_t ilf_len; /* len of inode buffer */
int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
+} __attribute__((packed));
/*
@@ -322,7 +311,7 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
#define XFS_ILOG_DEV 0x010 /* log the dev field */
-#define XFS_ILOG_UUID 0x020 /* log the uuid field */
+#define XFS_ILOG_UUID 0x020 /* added long ago, but never used */
#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
@@ -340,9 +329,9 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
- XFS_ILOG_UUID | XFS_ILOG_ADATA | \
- XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
- XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT)
@@ -352,10 +341,10 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
- XFS_ILOG_DEV | XFS_ILOG_UUID | \
- XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
- XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+ XFS_ILOG_DEV | XFS_ILOG_ADATA | \
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+ XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
static inline int xfs_ilog_fbroot(int w)
{
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 9d5406b4f663..585b35d34142 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -30,6 +30,7 @@
#include "xfs_bmap.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 55c88a732690..dd019cee1b3b 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,6 +34,7 @@
#include "xfs_rmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_bmap.h"
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5d4e43ef4eea..3fb29a5ea915 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -672,7 +672,6 @@ xfs_rtmodify_range(
/*
* Compute a mask of relevant bits.
*/
- bit = 0;
mask = ((xfs_rtword_t)1 << lastbit) - 1;
/*
* Set/clear the active bits.
@@ -1086,3 +1085,15 @@ xfs_rtalloc_query_all(
return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
}
+
+/*
+ * Verify that an realtime block number pointer doesn't point off the
+ * end of the realtime device.
+ */
+bool
+xfs_verify_rtbno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rtbno < mp->m_sb.sb_rblocks;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 0220159bd463..3c560695c546 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -48,6 +48,12 @@ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
typedef int64_t xfs_sfiloff_t; /* signed block number in a file */
/*
+ * New verifiers will return the instruction address of the failing check.
+ * NULL means everything is ok.
+ */
+typedef void * xfs_failaddr_t;
+
+/*
* Null values for the types.
*/
#define NULLFSBLOCK ((xfs_fsblock_t)-1)
@@ -136,5 +142,21 @@ typedef uint32_t xfs_dqid_t;
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
+struct xfs_iext_cursor {
+ struct xfs_iext_leaf *leaf;
+ int pos;
+};
+
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+} xfs_exntst_t;
+
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
new file mode 100644
index 000000000000..2a9b4f9e93c6
--- /dev/null
+++ b/fs/xfs/scrub/agheader.c
@@ -0,0 +1,658 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Set up scrub to check all the static metadata in each AG.
+ * This means the SB, AGF, AGI, and AGFL headers.
+ */
+int
+xfs_scrub_setup_ag_header(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ if (sc->sm->sm_agno >= mp->m_sb.sb_agcount ||
+ sc->sm->sm_ino || sc->sm->sm_gen)
+ return -EINVAL;
+ return xfs_scrub_setup_fs(sc, ip);
+}
+
+/* Walk all the blocks in the AGFL. */
+int
+xfs_scrub_walk_agfl(
+ struct xfs_scrub_context *sc,
+ int (*fn)(struct xfs_scrub_context *,
+ xfs_agblock_t bno, void *),
+ void *priv)
+{
+ struct xfs_agf *agf;
+ __be32 *agfl_bno;
+ struct xfs_mount *mp = sc->mp;
+ unsigned int flfirst;
+ unsigned int fllast;
+ int i;
+ int error;
+
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp);
+ flfirst = be32_to_cpu(agf->agf_flfirst);
+ fllast = be32_to_cpu(agf->agf_fllast);
+
+ /* Nothing to walk in an empty AGFL. */
+ if (agf->agf_flcount == cpu_to_be32(0))
+ return 0;
+
+ /* first to last is a consecutive list. */
+ if (fllast >= flfirst) {
+ for (i = flfirst; i <= fllast; i++) {
+ error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+ }
+
+ return 0;
+ }
+
+ /* first to the end */
+ for (i = flfirst; i < XFS_AGFL_SIZE(mp); i++) {
+ error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+ }
+
+ /* the start to last. */
+ for (i = 0; i <= fllast; i++) {
+ error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Superblock */
+
+/*
+ * Scrub the filesystem superblock.
+ *
+ * Note: We do /not/ attempt to check AG 0's superblock. Mount is
+ * responsible for validating all the geometry information in sb 0, so
+ * if the filesystem is capable of initiating online scrub, then clearly
+ * sb 0 is ok and we can use its information to check everything else.
+ */
+int
+xfs_scrub_superblock(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_dsb *sb;
+ xfs_agnumber_t agno;
+ uint32_t v2_ok;
+ __be32 features_mask;
+ int error;
+ __be16 vernum_mask;
+
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return 0;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
+ return error;
+
+ sb = XFS_BUF_TO_SBP(bp);
+
+ /*
+ * Verify the geometries match. Fields that are permanently
+ * set by mkfs are checked; fields that can be updated later
+ * (and are not propagated to backup superblocks) are preen
+ * checked.
+ */
+ if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that are set at mkfs time. */
+ vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
+ XFS_SB_VERSION_NUMBITS |
+ XFS_SB_VERSION_ALIGNBIT |
+ XFS_SB_VERSION_DALIGNBIT |
+ XFS_SB_VERSION_SHAREDBIT |
+ XFS_SB_VERSION_LOGV2BIT |
+ XFS_SB_VERSION_SECTORBIT |
+ XFS_SB_VERSION_EXTFLGBIT |
+ XFS_SB_VERSION_DIRV2BIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that can be set after mkfs time. */
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT |
+ XFS_SB_VERSION_NLINKBIT |
+ XFS_SB_VERSION_QUOTABIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname)))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_blocklog != mp->m_sb.sb_blocklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_sectlog != mp->m_sb.sb_sectlog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodelog != mp->m_sb.sb_inodelog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblog != mp->m_sb.sb_inopblog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblklog != mp->m_sb.sb_agblklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct)
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /*
+ * Skip the summary counters since we track them in memory anyway.
+ * sb_icount, sb_ifree, sb_fdblocks, sb_frexents
+ */
+
+ if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /*
+ * Skip the quota flags since repair will force quotacheck.
+ * sb_qflags
+ */
+
+ if (sb->sb_flags != mp->m_sb.sb_flags)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Do we see any invalid bits in sb_features2? */
+ if (!xfs_sb_version_hasmorebits(&mp->m_sb)) {
+ if (sb->sb_features2 != 0)
+ xfs_scrub_block_set_corrupt(sc, bp);
+ } else {
+ v2_ok = XFS_SB_VERSION2_OKBITS;
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5)
+ v2_ok |= XFS_SB_VERSION2_CRCBIT;
+
+ if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_features2 != sb->sb_bad_features2)
+ xfs_scrub_block_set_preen(sc, bp);
+ }
+
+ /* Check sb_features2 flags that are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT |
+ XFS_SB_VERSION2_PROJID32BIT |
+ XFS_SB_VERSION2_CRCBIT |
+ XFS_SB_VERSION2_FTYPE);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_features2 flags that can be set after mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ /* all v5 fields must be zero */
+ if (memchr_inv(&sb->sb_features_compat, 0,
+ sizeof(struct xfs_dsb) -
+ offsetof(struct xfs_dsb, sb_features_compat)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+ } else {
+ /* Check compat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN);
+ if ((sb->sb_features_compat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check ro compat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN |
+ XFS_SB_FEAT_RO_COMPAT_FINOBT |
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT |
+ XFS_SB_FEAT_RO_COMPAT_REFLINK);
+ if ((sb->sb_features_ro_compat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_ro_compat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check incompat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN |
+ XFS_SB_FEAT_INCOMPAT_FTYPE |
+ XFS_SB_FEAT_INCOMPAT_SPINODES |
+ XFS_SB_FEAT_INCOMPAT_META_UUID);
+ if ((sb->sb_features_incompat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_incompat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check log incompat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN);
+ if ((sb->sb_features_log_incompat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_log_incompat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Don't care about sb_crc */
+
+ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /* Don't care about sb_lsn */
+ }
+
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb)) {
+ /* The metadata UUID must be the same for all supers */
+ if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+ xfs_scrub_block_set_corrupt(sc, bp);
+ }
+
+ /* Everything else must be zero. */
+ if (memchr_inv(sb + 1, 0,
+ BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ return error;
+}
+
+/* AGF */
+
+/* Scrub the AGF. */
+int
+xfs_scrub_agf(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t eoag;
+ xfs_agblock_t agfl_first;
+ xfs_agblock_t agfl_last;
+ xfs_agblock_t agfl_count;
+ xfs_agblock_t fl_count;
+ int level;
+ int error = 0;
+
+ agno = sc->sa.agno = sc->sm->sm_agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+ &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error))
+ goto out;
+
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+
+ /* Check the AG length */
+ eoag = be32_to_cpu(agf->agf_length);
+ if (eoag != xfs_ag_block_count(mp, agno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ /* Check the AGF btree roots and levels */
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ }
+
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ agbno = be32_to_cpu(agf->agf_refcount_root);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_refcount_level);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ }
+
+ /* Check the AGFL counters */
+ agfl_first = be32_to_cpu(agf->agf_flfirst);
+ agfl_last = be32_to_cpu(agf->agf_fllast);
+ agfl_count = be32_to_cpu(agf->agf_flcount);
+ if (agfl_last > agfl_first)
+ fl_count = agfl_last - agfl_first + 1;
+ else
+ fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1;
+ if (agfl_count != 0 && fl_count != agfl_count)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+out:
+ return error;
+}
+
+/* AGFL */
+
+struct xfs_scrub_agfl_info {
+ unsigned int sz_entries;
+ unsigned int nr_entries;
+ xfs_agblock_t *entries;
+};
+
+/* Scrub an AGFL block. */
+STATIC int
+xfs_scrub_agfl_block(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_agfl_info *sai = priv;
+ xfs_agnumber_t agno = sc->sa.agno;
+
+ if (xfs_verify_agbno(mp, agno, agbno) &&
+ sai->nr_entries < sai->sz_entries)
+ sai->entries[sai->nr_entries++] = agbno;
+ else
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp);
+
+ return 0;
+}
+
+static int
+xfs_scrub_agblock_cmp(
+ const void *pa,
+ const void *pb)
+{
+ const xfs_agblock_t *a = pa;
+ const xfs_agblock_t *b = pb;
+
+ return (int)*a - (int)*b;
+}
+
+/* Scrub the AGFL. */
+int
+xfs_scrub_agfl(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_agfl_info sai = { 0 };
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno;
+ unsigned int agflcount;
+ unsigned int i;
+ int error;
+
+ agno = sc->sa.agno = sc->sm->sm_agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+ &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ if (!xfs_scrub_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+ goto out;
+ if (!sc->sa.agf_bp)
+ return -EFSCORRUPTED;
+
+ /* Allocate buffer to ensure uniqueness of AGFL entries. */
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agflcount = be32_to_cpu(agf->agf_flcount);
+ if (agflcount > XFS_AGFL_SIZE(sc->mp)) {
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ goto out;
+ }
+ sai.sz_entries = agflcount;
+ sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
+ if (!sai.entries) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ /* Check the blocks in the AGFL. */
+ error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai);
+ if (error)
+ goto out_free;
+
+ if (agflcount != sai.nr_entries) {
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ goto out_free;
+ }
+
+ /* Sort entries, check for duplicates. */
+ sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]),
+ xfs_scrub_agblock_cmp, NULL);
+ for (i = 1; i < sai.nr_entries; i++) {
+ if (sai.entries[i] == sai.entries[i - 1]) {
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ break;
+ }
+ }
+
+out_free:
+ kmem_free(sai.entries);
+out:
+ return error;
+}
+
+/* AGI */
+
+/* Scrub the AGI. */
+int
+xfs_scrub_agi(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agi *agi;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t eoag;
+ xfs_agino_t agino;
+ xfs_agino_t first_agino;
+ xfs_agino_t last_agino;
+ xfs_agino_t icount;
+ int i;
+ int level;
+ int error = 0;
+
+ agno = sc->sa.agno = sc->sm->sm_agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+ &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error))
+ goto out;
+
+ agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+
+ /* Check the AG length */
+ eoag = be32_to_cpu(agi->agi_length);
+ if (eoag != xfs_ag_block_count(mp, agno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check btree roots and levels */
+ agbno = be32_to_cpu(agi->agi_root);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ level = be32_to_cpu(agi->agi_level);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agbno = be32_to_cpu(agi->agi_free_root);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ level = be32_to_cpu(agi->agi_free_level);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ /* Check inode counters */
+ xfs_ialloc_agino_range(mp, agno, &first_agino, &last_agino);
+ icount = be32_to_cpu(agi->agi_count);
+ if (icount > last_agino - first_agino + 1 ||
+ icount < be32_to_cpu(agi->agi_freecount))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check inode pointers */
+ agino = be32_to_cpu(agi->agi_newino);
+ if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ agino = be32_to_cpu(agi->agi_dirino);
+ if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check unlinked inode buckets */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ agino = be32_to_cpu(agi->agi_unlinked[i]);
+ if (agino == NULLAGINO)
+ continue;
+ if (!xfs_verify_agino(mp, agno, agino))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (agi->agi_pad32 != cpu_to_be32(0))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
new file mode 100644
index 000000000000..059663e13414
--- /dev/null
+++ b/fs/xfs/scrub/alloc.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub free space btrees.
+ */
+int
+xfs_scrub_setup_ag_allocbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Free space btree scrubber. */
+
+/* Scrub a bnobt/cntbt record. */
+STATIC int
+xfs_scrub_allocbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+ int error = 0;
+
+ bno = be32_to_cpu(rec->alloc.ar_startblock);
+ len = be32_to_cpu(rec->alloc.ar_blockcount);
+
+ if (bno + len <= bno ||
+ !xfs_verify_agbno(mp, agno, bno) ||
+ !xfs_verify_agbno(mp, agno, bno + len - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ return error;
+}
+
+/* Scrub the freespace btrees for some AG. */
+STATIC int
+xfs_scrub_allocbt(
+ struct xfs_scrub_context *sc,
+ xfs_btnum_t which)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_btree_cur *cur;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
+ return xfs_scrub_btree(sc, cur, xfs_scrub_allocbt_rec, &oinfo, NULL);
+}
+
+int
+xfs_scrub_bnobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_allocbt(sc, XFS_BTNUM_BNO);
+}
+
+int
+xfs_scrub_cntbt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT);
+}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
new file mode 100644
index 000000000000..4ed80474f545
--- /dev/null
+++ b/fs/xfs/scrub/attr.c
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/dabtree.h"
+#include "scrub/trace.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/* Set us up to scrub an inode's extended attributes. */
+int
+xfs_scrub_setup_xattr(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ size_t sz;
+
+ /*
+ * Allocate the buffer without the inode lock held. We need enough
+ * space to read every xattr value in the file or enough space to
+ * hold three copies of the xattr free space bitmap. (Not both at
+ * the same time.)
+ */
+ sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) *
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize));
+ sc->buf = kmem_zalloc_large(sz, KM_SLEEP);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Extended Attributes */
+
+struct xfs_scrub_xattr {
+ struct xfs_attr_list_context context;
+ struct xfs_scrub_context *sc;
+};
+
+/*
+ * Check that an extended attribute key can be looked up by hash.
+ *
+ * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked)
+ * to call this function for every attribute key in an inode. Once
+ * we're here, we load the attribute value to see if any errors happen,
+ * or if we get more or less data than we expected.
+ */
+static void
+xfs_scrub_xattr_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
+{
+ struct xfs_scrub_xattr *sx;
+ struct xfs_da_args args = { NULL };
+ int error = 0;
+
+ sx = container_of(context, struct xfs_scrub_xattr, context);
+
+ if (flags & XFS_ATTR_INCOMPLETE) {
+ /* Incomplete attr key, just mark the inode for preening. */
+ xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL);
+ return;
+ }
+
+ args.flags = ATTR_KERNOTIME;
+ if (flags & XFS_ATTR_ROOT)
+ args.flags |= ATTR_ROOT;
+ else if (flags & XFS_ATTR_SECURE)
+ args.flags |= ATTR_SECURE;
+ args.geo = context->dp->i_mount->m_attr_geo;
+ args.whichfork = XFS_ATTR_FORK;
+ args.dp = context->dp;
+ args.name = name;
+ args.namelen = namelen;
+ args.hashval = xfs_da_hashname(args.name, args.namelen);
+ args.trans = context->tp;
+ args.value = sx->sc->buf;
+ args.valuelen = XATTR_SIZE_MAX;
+
+ error = xfs_attr_get_ilocked(context->dp, &args);
+ if (error == -EEXIST)
+ error = 0;
+ if (!xfs_scrub_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+ &error))
+ goto fail_xref;
+ if (args.valuelen != valuelen)
+ xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
+ args.blkno);
+
+fail_xref:
+ return;
+}
+
+/*
+ * Mark a range [start, start+len) in this map. Returns true if the
+ * region was free, and false if there's a conflict or a problem.
+ *
+ * Within a char, the lowest bit of the char represents the byte with
+ * the smallest address
+ */
+STATIC bool
+xfs_scrub_xattr_set_map(
+ struct xfs_scrub_context *sc,
+ unsigned long *map,
+ unsigned int start,
+ unsigned int len)
+{
+ unsigned int mapsize = sc->mp->m_attr_geo->blksize;
+ bool ret = true;
+
+ if (start >= mapsize)
+ return false;
+ if (start + len > mapsize) {
+ len = mapsize - start;
+ ret = false;
+ }
+
+ if (find_next_bit(map, mapsize, start) < start + len)
+ ret = false;
+ bitmap_set(map, start, len);
+
+ return ret;
+}
+
+/*
+ * Check the leaf freemap from the usage bitmap. Returns false if the
+ * attr freemap has problems or points to used space.
+ */
+STATIC bool
+xfs_scrub_xattr_check_freemap(
+ struct xfs_scrub_context *sc,
+ unsigned long *map,
+ struct xfs_attr3_icleaf_hdr *leafhdr)
+{
+ unsigned long *freemap;
+ unsigned long *dstmap;
+ unsigned int mapsize = sc->mp->m_attr_geo->blksize;
+ int i;
+
+ /* Construct bitmap of freemap contents. */
+ freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
+ bitmap_zero(freemap, mapsize);
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (!xfs_scrub_xattr_set_map(sc, freemap,
+ leafhdr->freemap[i].base,
+ leafhdr->freemap[i].size))
+ return false;
+ }
+
+ /* Look for bits that are set in freemap and are marked in use. */
+ dstmap = freemap + BITS_TO_LONGS(mapsize);
+ return bitmap_and(dstmap, freemap, map, mapsize) == 0;
+}
+
+/*
+ * Check this leaf entry's relations to everything else.
+ * Returns the number of bytes used for the name/value data.
+ */
+STATIC void
+xfs_scrub_xattr_entry(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ char *buf_end,
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ unsigned long *usedmap,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ unsigned int *usedbytes,
+ __u32 *last_hashval)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ char *name_end;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ unsigned int nameidx;
+ unsigned int namesize;
+
+ if (ent->pad2 != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ /* Hash values in order? */
+ if (be32_to_cpu(ent->hashval) < *last_hashval)
+ xfs_scrub_da_set_corrupt(ds, level);
+ *last_hashval = be32_to_cpu(ent->hashval);
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused ||
+ nameidx >= mp->m_attr_geo->blksize) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return;
+ }
+
+ /* Check the name information. */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0 || rentry->valueblk == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+ if (name_end > buf_end)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, nameidx, namesize))
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ *usedbytes += namesize;
+}
+
+/* Scrub an attribute leaf. */
+STATIC int
+xfs_scrub_xattr_block(
+ struct xfs_scrub_da_btree *ds,
+ int level)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_buf *bp = blk->bp;
+ xfs_dablk_t *last_checked = ds->private;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ unsigned long *usedmap = ds->sc->buf;
+ char *buf_end;
+ size_t off;
+ __u32 last_hashval = 0;
+ unsigned int usedbytes = 0;
+ unsigned int hdrsize;
+ int i;
+
+ if (*last_checked == blk->blkno)
+ return 0;
+ *last_checked = blk->blkno;
+ bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+
+ /* Check all the padding. */
+ if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) {
+ struct xfs_attr3_leafblock *leaf = bp->b_addr;
+
+ if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
+ leaf->hdr.info.hdr.pad != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ } else {
+ if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+
+ /* Check the leaf header */
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+
+ if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (leafhdr.firstused > mp->m_attr_geo->blksize)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (leafhdr.firstused < hdrsize)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ /* Mark the leaf entry itself. */
+ off = (char *)ent - (char *)leaf;
+ if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t))) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+
+ /* Check the entry and nameval. */
+ xfs_scrub_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
+ usedmap, ent, i, &usedbytes, &last_hashval);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+ }
+
+ if (!xfs_scrub_xattr_check_freemap(ds->sc, usedmap, &leafhdr))
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ if (leafhdr.usedbytes != usedbytes)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+out:
+ return 0;
+}
+
+/* Scrub a attribute btree record. */
+STATIC int
+xfs_scrub_xattr_rec(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ void *rec)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_attr_leaf_entry *ent = rec;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_buf *bp;
+ xfs_dahash_t calc_hash;
+ xfs_dahash_t hash;
+ int nameidx;
+ int hdrsize;
+ unsigned int badflags;
+ int error;
+
+ blk = &ds->state->path.blk[level];
+
+ /* Check the whole block, if necessary. */
+ error = xfs_scrub_xattr_block(ds, level);
+ if (error)
+ goto out;
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check the hash of the entry. */
+ error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval);
+ if (error)
+ goto out;
+
+ /* Find the attr entry's location. */
+ bp = blk->bp;
+ hdrsize = xfs_attr3_leaf_hdr_size(bp->b_addr);
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+
+ /* Retrieve the entry and check it. */
+ hash = be32_to_cpu(ent->hashval);
+ badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
+ XFS_ATTR_INCOMPLETE);
+ if ((ent->flags & badflags) != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = (struct xfs_attr_leaf_name_local *)
+ (((char *)bp->b_addr) + nameidx);
+ if (lentry->namelen <= 0) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen);
+ } else {
+ rentry = (struct xfs_attr_leaf_name_remote *)
+ (((char *)bp->b_addr) + nameidx);
+ if (rentry->namelen <= 0) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_da_hashname(rentry->name, rentry->namelen);
+ }
+ if (calc_hash != hash)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+out:
+ return error;
+}
+
+/* Scrub the extended attribute metadata. */
+int
+xfs_scrub_xattr(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_xattr sx;
+ struct attrlist_cursor_kern cursor = { 0 };
+ xfs_dablk_t last_checked = -1U;
+ int error = 0;
+
+ if (!xfs_inode_hasattr(sc->ip))
+ return -ENOENT;
+
+ memset(&sx, 0, sizeof(sx));
+ /* Check attribute tree structure */
+ error = xfs_scrub_da_btree(sc, XFS_ATTR_FORK, xfs_scrub_xattr_rec,
+ &last_checked);
+ if (error)
+ goto out;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check that every attr key can also be looked up by hash. */
+ sx.context.dp = sc->ip;
+ sx.context.cursor = &cursor;
+ sx.context.resynch = 1;
+ sx.context.put_listent = xfs_scrub_xattr_listent;
+ sx.context.tp = sc->tp;
+ sx.context.flags = ATTR_INCOMPLETE;
+ sx.sc = sc;
+
+ /*
+ * Look up every xattr in this file by name.
+ *
+ * Use the backend implementation of xfs_attr_list to call
+ * xfs_scrub_xattr_listent on every attribute key in this inode.
+ * In other words, we use the same iterator/callback mechanism
+ * that listattr uses to scrub extended attributes, though in our
+ * _listent function, we check the value of the attribute.
+ *
+ * The VFS only locks i_rwsem when modifying attrs, so keep all
+ * three locks held because that's the only way to ensure we're
+ * the only thread poking into the da btree. We traverse the da
+ * btree while holding a leaf buffer locked for the xattr name
+ * iteration, which doesn't really follow the usual buffer
+ * locking order.
+ */
+ error = xfs_attr_list_int_ilocked(&sx.context);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ goto out;
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
new file mode 100644
index 000000000000..42fec0bcd9e1
--- /dev/null
+++ b/fs/xfs/scrub/bmap.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* Set us up with an inode's bmap. */
+int
+xfs_scrub_setup_inode_bmap(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_scrub_get_inode(sc, ip);
+ if (error)
+ goto out;
+
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /*
+ * We don't want any ephemeral data fork updates sitting around
+ * while we inspect block mappings, so wait for directio to finish
+ * and flush dirty data if we have delalloc reservations.
+ */
+ if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
+ sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+ inode_dio_wait(VFS_I(sc->ip));
+ error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
+ if (error)
+ goto out;
+ }
+
+ /* Got the inode, lock it and we're ready to go. */
+ error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ if (error)
+ goto out;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode */
+ return error;
+}
+
+/*
+ * Inode fork block mapping (BMBT) scrubber.
+ * More complex than the others because we have to scrub
+ * all the extents regardless of whether or not the fork
+ * is in btree format.
+ */
+
+struct xfs_scrub_bmap_info {
+ struct xfs_scrub_context *sc;
+ xfs_fileoff_t lastoff;
+ bool is_rt;
+ bool is_shared;
+ int whichfork;
+};
+
+/* Scrub a single extent record. */
+STATIC int
+xfs_scrub_bmap_extent(
+ struct xfs_inode *ip,
+ struct xfs_btree_cur *cur,
+ struct xfs_scrub_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+ struct xfs_buf *bp = NULL;
+ int error = 0;
+
+ if (cur)
+ xfs_btree_get_block(cur, 0, &bp);
+
+ /*
+ * Check for out-of-order extents. This record could have come
+ * from the incore list, for which there is no ordering check.
+ */
+ if (irec->br_startoff < info->lastoff)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* There should never be a "hole" extent in either extent list. */
+ if (irec->br_startblock == HOLESTARTBLOCK)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check for delalloc extents. We never iterate the ones in the
+ * in-core extent scan, and we should never see these in the bmbt.
+ */
+ if (isnullstartblock(irec->br_startblock))
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Make sure the extent points to a valid place. */
+ if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (info->is_rt &&
+ (!xfs_verify_rtbno(mp, irec->br_startblock) ||
+ !xfs_verify_rtbno(mp, irec->br_startblock +
+ irec->br_blockcount - 1)))
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (!info->is_rt &&
+ (!xfs_verify_fsbno(mp, irec->br_startblock) ||
+ !xfs_verify_fsbno(mp, irec->br_startblock +
+ irec->br_blockcount - 1)))
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* We don't allow unwritten extents on attr forks. */
+ if (irec->br_state == XFS_EXT_UNWRITTEN &&
+ info->whichfork == XFS_ATTR_FORK)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ info->lastoff = irec->br_startoff + irec->br_blockcount;
+ return error;
+}
+
+/* Scrub a bmbt record. */
+STATIC int
+xfs_scrub_bmapbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_info *info = bs->private;
+ struct xfs_inode *ip = bs->cur->bc_private.b.ip;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block;
+ uint64_t owner;
+ int i;
+
+ /*
+ * Check the owners of the btree blocks up to the level below
+ * the root since the verifiers don't do that.
+ */
+ if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) &&
+ bs->cur->bc_ptrs[0] == 1) {
+ for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
+ block = xfs_btree_get_block(bs->cur, i, &bp);
+ owner = be64_to_cpu(block->bb_u.l.bb_owner);
+ if (owner != ip->i_ino)
+ xfs_scrub_fblock_set_corrupt(bs->sc,
+ info->whichfork, 0);
+ }
+ }
+
+ /* Set up the in-core record and scrub it. */
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ return xfs_scrub_bmap_extent(ip, bs->cur, info, &irec);
+}
+
+/* Scan the btree records. */
+STATIC int
+xfs_scrub_bmap_btree(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ struct xfs_scrub_bmap_info *info)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ error = xfs_scrub_btree(sc, cur, xfs_scrub_bmapbt_rec, &oinfo, info);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ return error;
+}
+
+/*
+ * Scrub an inode fork's block mappings.
+ *
+ * First we scan every record in every btree block, if applicable.
+ * Then we unconditionally scan the incore extent cache.
+ */
+STATIC int
+xfs_scrub_bmap(
+ struct xfs_scrub_context *sc,
+ int whichfork)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_info info = { NULL };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t endoff;
+ struct xfs_iext_cursor icur;
+ bool found;
+ int error = 0;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.whichfork = whichfork;
+ info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
+ info.sc = sc;
+
+ switch (whichfork) {
+ case XFS_COW_FORK:
+ /* Non-existent CoW forks are ignorable. */
+ if (!ifp)
+ goto out;
+ /* No CoW forks on non-reflink inodes/filesystems. */
+ if (!xfs_is_reflink_inode(ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ goto out;
+ }
+ break;
+ case XFS_ATTR_FORK:
+ if (!ifp)
+ goto out;
+ if (!xfs_sb_version_hasattr(&mp->m_sb) &&
+ !xfs_sb_version_hasattr2(&mp->m_sb))
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ break;
+ default:
+ ASSERT(whichfork == XFS_DATA_FORK);
+ break;
+ }
+
+ /* Check the fork values */
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_UUID:
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ /* No mappings to check. */
+ goto out;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (whichfork == XFS_COW_FORK) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ error = xfs_scrub_bmap_btree(sc, whichfork, &info);
+ if (error)
+ goto out;
+ break;
+ default:
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Now try to scrub the in-memory extent list. */
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+ }
+
+ /* Find the offset of the last extent in the mapping. */
+ error = xfs_bmap_last_offset(ip, &endoff, whichfork);
+ if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+
+ /* Scrub extent records. */
+ info.lastoff = 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ for (found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &irec);
+ found != 0;
+ found = xfs_iext_next_extent(ifp, &icur, &irec)) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+ if (isnullstartblock(irec.br_startblock))
+ continue;
+ if (irec.br_startoff >= endoff) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork,
+ irec.br_startoff);
+ goto out;
+ }
+ error = xfs_scrub_bmap_extent(ip, NULL, &info, &irec);
+ if (error)
+ goto out;
+ }
+
+out:
+ return error;
+}
+
+/* Scrub an inode's data fork. */
+int
+xfs_scrub_bmap_data(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_bmap(sc, XFS_DATA_FORK);
+}
+
+/* Scrub an inode's attr fork. */
+int
+xfs_scrub_bmap_attr(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_bmap(sc, XFS_ATTR_FORK);
+}
+
+/* Scrub an inode's CoW fork. */
+int
+xfs_scrub_bmap_cow(
+ struct xfs_scrub_context *sc)
+{
+ if (!xfs_is_reflink_inode(sc->ip))
+ return -ENOENT;
+
+ return xfs_scrub_bmap(sc, XFS_COW_FORK);
+}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
new file mode 100644
index 000000000000..df0766132ace
--- /dev/null
+++ b/fs/xfs/scrub/btree.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* btree scrubbing */
+
+/*
+ * Check for btree operation errors. See the section about handling
+ * operational errors in common.c.
+ */
+bool
+xfs_scrub_btree_process_error(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error)
+{
+ if (*error == 0)
+ return true;
+
+ switch (*error) {
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ *error = 0;
+ /* fall through */
+ default:
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ trace_xfs_scrub_ifork_btree_op_error(sc, cur, level,
+ *error, __return_address);
+ else
+ trace_xfs_scrub_btree_op_error(sc, cur, level,
+ *error, __return_address);
+ break;
+ }
+ return false;
+}
+
+/* Record btree block corruption. */
+void
+xfs_scrub_btree_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ trace_xfs_scrub_ifork_btree_error(sc, cur, level,
+ __return_address);
+ else
+ trace_xfs_scrub_btree_error(sc, cur, level,
+ __return_address);
+}
+
+/*
+ * Make sure this record is in order and doesn't stray outside of the parent
+ * keys.
+ */
+STATIC void
+xfs_scrub_btree_rec(
+ struct xfs_scrub_btree *bs)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key key;
+ union xfs_btree_key hkey;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, 0, &bp);
+ rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+
+ trace_xfs_scrub_btree_rec(bs->sc, cur, 0);
+
+ /* If this isn't the first record, are they in order? */
+ if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 0);
+ bs->firstrec = false;
+ memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
+
+ if (cur->bc_nlevels == 1)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ keyblock = xfs_btree_get_block(cur, 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Make sure this key is in order and doesn't stray outside of the parent
+ * keys.
+ */
+STATIC void
+xfs_scrub_btree_key(
+ struct xfs_scrub_btree *bs,
+ int level)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *key;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+
+ trace_xfs_scrub_btree_key(bs->sc, cur, level);
+
+ /* If this isn't the first key, are they in order? */
+ if (!bs->firstkey[level] &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ bs->firstkey[level] = false;
+ memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+
+ if (level + 1 >= cur->bc_nlevels)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ keyblock = xfs_btree_get_block(cur, level + 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+}
+
+/*
+ * Check a btree pointer. Returns true if it's ok to use this pointer.
+ * Callers do not need to set the corrupt flag.
+ */
+static bool
+xfs_scrub_btree_ptr_ok(
+ struct xfs_scrub_btree *bs,
+ int level,
+ union xfs_btree_ptr *ptr)
+{
+ bool res;
+
+ /* A btree rooted in an inode has no block pointer to the root. */
+ if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == bs->cur->bc_nlevels)
+ return true;
+
+ /* Otherwise, check the pointers. */
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
+ else
+ res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
+ if (!res)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+
+ return res;
+}
+
+/* Check that a btree block's sibling matches what we expect it. */
+STATIC int
+xfs_scrub_btree_block_check_sibling(
+ struct xfs_scrub_btree *bs,
+ int level,
+ int direction,
+ union xfs_btree_ptr *sibling)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ struct xfs_btree_block *pblock;
+ struct xfs_buf *pbp;
+ struct xfs_btree_cur *ncur = NULL;
+ union xfs_btree_ptr *pp;
+ int success;
+ int error;
+
+ error = xfs_btree_dup_cursor(cur, &ncur);
+ if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error) ||
+ !ncur)
+ return error;
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (xfs_btree_ptr_is_null(cur, sibling)) {
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (error == 0 && success)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ error = 0;
+ goto out;
+ }
+
+ /* Increment upper level pointer. */
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error))
+ goto out;
+ if (!success) {
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level + 1);
+ goto out;
+ }
+
+ /* Compare upper level pointer to sibling pointer. */
+ pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp))
+ goto out;
+
+ if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+out:
+ xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/* Check the siblings of a btree block. */
+STATIC int
+xfs_scrub_btree_block_check_siblings(
+ struct xfs_scrub_btree *bs,
+ struct xfs_btree_block *block)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_ptr leftsib;
+ union xfs_btree_ptr rightsib;
+ int level;
+ int error = 0;
+
+ xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
+ xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
+ level = xfs_btree_get_level(block);
+
+ /* Root block should never have siblings. */
+ if (level == cur->bc_nlevels - 1) {
+ if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
+ !xfs_btree_ptr_is_null(cur, &rightsib))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ goto out;
+ }
+
+ /*
+ * Does the left & right sibling pointers match the adjacent
+ * parent level pointers?
+ * (These function absorbs error codes for us.)
+ */
+ error = xfs_scrub_btree_block_check_sibling(bs, level, -1, &leftsib);
+ if (error)
+ return error;
+ error = xfs_scrub_btree_block_check_sibling(bs, level, 1, &rightsib);
+ if (error)
+ return error;
+out:
+ return error;
+}
+
+/*
+ * Grab and scrub a btree block given a btree pointer. Returns block
+ * and buffer pointers (if applicable) if they're ok to use.
+ */
+STATIC int
+xfs_scrub_btree_get_block(
+ struct xfs_scrub_btree *bs,
+ int level,
+ union xfs_btree_ptr *pp,
+ struct xfs_btree_block **pblock,
+ struct xfs_buf **pbp)
+{
+ void *failed_at;
+ int error;
+
+ *pblock = NULL;
+ *pbp = NULL;
+
+ error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, level, &error) ||
+ !*pblock)
+ return error;
+
+ xfs_btree_get_block(bs->cur, level, pbp);
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
+ level, *pbp);
+ else
+ failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
+ level, *pbp);
+ if (failed_at) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+ return 0;
+ }
+
+ /*
+ * Check the block's siblings; this function absorbs error codes
+ * for us.
+ */
+ return xfs_scrub_btree_block_check_siblings(bs, *pblock);
+}
+
+/*
+ * Check that the low and high keys of this block match the keys stored
+ * in the parent block.
+ */
+STATIC void
+xfs_scrub_btree_block_keys(
+ struct xfs_scrub_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ union xfs_btree_key block_keys;
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *high_bk;
+ union xfs_btree_key *parent_keys;
+ union xfs_btree_key *high_pk;
+ struct xfs_btree_block *parent_block;
+ struct xfs_buf *bp;
+
+ if (level >= cur->bc_nlevels - 1)
+ return;
+
+ /* Calculate the keys for this block. */
+ xfs_btree_get_keys(cur, block, &block_keys);
+
+ /* Obtain the parent's copy of the keys for this block. */
+ parent_block = xfs_btree_get_block(cur, level + 1, &bp);
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_block);
+
+ if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Get high keys */
+ high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_block);
+
+ if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Visit all nodes and leaves of a btree. Check that all pointers and
+ * records are in order, that the keys reflect the records, and use a callback
+ * so that the caller can verify individual records.
+ */
+int
+xfs_scrub_btree(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ xfs_scrub_btree_rec_fn scrub_fn,
+ struct xfs_owner_info *oinfo,
+ void *private)
+{
+ struct xfs_scrub_btree bs = { NULL };
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int level;
+ struct xfs_buf *bp;
+ int i;
+ int error = 0;
+
+ /* Initialize scrub state */
+ bs.cur = cur;
+ bs.scrub_rec = scrub_fn;
+ bs.oinfo = oinfo;
+ bs.firstrec = true;
+ bs.private = private;
+ bs.sc = sc;
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
+ bs.firstkey[i] = true;
+ INIT_LIST_HEAD(&bs.to_check);
+
+ /* Don't try to check a tree with a height we can't handle. */
+ if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ xfs_scrub_btree_set_corrupt(sc, cur, 0);
+ goto out;
+ }
+
+ /*
+ * Load the root of the btree. The helper function absorbs
+ * error codes for us.
+ */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ if (!xfs_scrub_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ goto out;
+ error = xfs_scrub_btree_get_block(&bs, level, &ptr, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ cur->bc_ptrs[level] = 1;
+
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ if (level == 0) {
+ /* End of leaf, pop back towards the root. */
+ if (cur->bc_ptrs[level] >
+ be16_to_cpu(block->bb_numrecs)) {
+ xfs_scrub_btree_block_keys(&bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ /* Records in order for scrub? */
+ xfs_scrub_btree_rec(&bs);
+
+ /* Call out to the record checker. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ error = bs.scrub_rec(&bs, recp);
+ if (error)
+ break;
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ xfs_scrub_btree_block_keys(&bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ /* Keys in order for scrub? */
+ xfs_scrub_btree_key(&bs, level);
+
+ /* Drill another level deeper. */
+ pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ if (!xfs_scrub_btree_ptr_ok(&bs, level, pp)) {
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+ level--;
+ error = xfs_scrub_btree_get_block(&bs, level, pp, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ cur->bc_ptrs[level] = 1;
+ }
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
new file mode 100644
index 000000000000..4de825a626d1
--- /dev/null
+++ b/fs/xfs/scrub/btree.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_BTREE_H__
+#define __XFS_SCRUB_BTREE_H__
+
+/* btree scrub */
+
+/* Check for btree operation errors. */
+bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level, int *error);
+
+/* Check for btree corruption. */
+void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level);
+
+struct xfs_scrub_btree;
+typedef int (*xfs_scrub_btree_rec_fn)(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec);
+
+struct xfs_scrub_btree {
+ /* caller-provided scrub state */
+ struct xfs_scrub_context *sc;
+ struct xfs_btree_cur *cur;
+ xfs_scrub_btree_rec_fn scrub_rec;
+ struct xfs_owner_info *oinfo;
+ void *private;
+
+ /* internal scrub state */
+ union xfs_btree_rec lastrec;
+ bool firstrec;
+ union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
+ bool firstkey[XFS_BTREE_MAXLEVELS];
+ struct list_head to_check;
+};
+int xfs_scrub_btree(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ xfs_scrub_btree_rec_fn scrub_fn,
+ struct xfs_owner_info *oinfo, void *private);
+
+#endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
new file mode 100644
index 000000000000..ac95fe911d96
--- /dev/null
+++ b/fs/xfs/scrub/common.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/btree.h"
+
+/* Common code for the metadata scrubbers. */
+
+/*
+ * Handling operational errors.
+ *
+ * The *_process_error() family of functions are used to process error return
+ * codes from functions called as part of a scrub operation.
+ *
+ * If there's no error, we return true to tell the caller that it's ok
+ * to move on to the next check in its list.
+ *
+ * For non-verifier errors (e.g. ENOMEM) we return false to tell the
+ * caller that something bad happened, and we preserve *error so that
+ * the caller can return the *error up the stack to userspace.
+ *
+ * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
+ * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words,
+ * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
+ * not via return codes. We return false to tell the caller that
+ * something bad happened. Since the error has been cleared, the caller
+ * will (presumably) return that zero and scrubbing will move on to
+ * whatever's next.
+ *
+ * ftrace can be used to record the precise metadata location and the
+ * approximate code location of the failed operation.
+ */
+
+/* Check for operational errors. */
+bool
+xfs_scrub_process_error(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error)
+{
+ switch (*error) {
+ case 0:
+ return true;
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ *error = 0;
+ /* fall through */
+ default:
+ trace_xfs_scrub_op_error(sc, agno, bno, *error,
+ __return_address);
+ break;
+ }
+ return false;
+}
+
+/* Check for operational errors for a file offset. */
+bool
+xfs_scrub_fblock_process_error(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error)
+{
+ switch (*error) {
+ case 0:
+ return true;
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ *error = 0;
+ /* fall through */
+ default:
+ trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error,
+ __return_address);
+ break;
+ }
+ return false;
+}
+
+/*
+ * Handling scrub corruption/optimization/warning checks.
+ *
+ * The *_set_{corrupt,preen,warning}() family of functions are used to
+ * record the presence of metadata that is incorrect (corrupt), could be
+ * optimized somehow (preen), or should be flagged for administrative
+ * review but is not incorrect (warn).
+ *
+ * ftrace can be used to record the precise metadata location and
+ * approximate code location of the failed check.
+ */
+
+/* Record a block which could be optimized. */
+void
+xfs_scrub_block_set_preen(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xfs_scrub_block_preen(sc, bp->b_bn, __return_address);
+}
+
+/*
+ * Record an inode which could be optimized. The trace data will
+ * include the block given by bp if bp is given; otherwise it will use
+ * the block location of the inode record itself.
+ */
+void
+xfs_scrub_ino_set_preen(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0,
+ __return_address);
+}
+
+/* Record a corrupt block. */
+void
+xfs_scrub_block_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address);
+}
+
+/*
+ * Record a corrupt inode. The trace data will include the block given
+ * by bp if bp is given; otherwise it will use the block location of the
+ * inode record itself.
+ */
+void
+xfs_scrub_ino_set_corrupt(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address);
+}
+
+/* Record corruption in a block indexed by a file fork. */
+void
+xfs_scrub_fblock_set_corrupt(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address);
+}
+
+/*
+ * Warn about inodes that need administrative review but is not
+ * incorrect.
+ */
+void
+xfs_scrub_ino_set_warning(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+ trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0,
+ __return_address);
+}
+
+/* Warn about a block indexed by a file fork that needs review. */
+void
+xfs_scrub_fblock_set_warning(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+ trace_xfs_scrub_fblock_warning(sc, whichfork, offset, __return_address);
+}
+
+/* Signal an incomplete scrub. */
+void
+xfs_scrub_set_incomplete(
+ struct xfs_scrub_context *sc)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
+ trace_xfs_scrub_incomplete(sc, __return_address);
+}
+
+/*
+ * AG scrubbing
+ *
+ * These helpers facilitate locking an allocation group's header
+ * buffers, setting up cursors for all btrees that are present, and
+ * cleaning everything up once we're through.
+ */
+
+/* Decide if we want to return an AG header read failure. */
+static inline bool
+want_ag_read_header_failure(
+ struct xfs_scrub_context *sc,
+ unsigned int type)
+{
+ /* Return all AG header read failures when scanning btrees. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
+ return true;
+ /*
+ * If we're scanning a given type of AG header, we only want to
+ * see read failures from that specific header. We'd like the
+ * other headers to cross-check them, but this isn't required.
+ */
+ if (sc->sm->sm_type == type)
+ return true;
+ return false;
+}
+
+/*
+ * Grab all the headers for an AG.
+ *
+ * The headers should be released by xfs_scrub_ag_free, but as a fail
+ * safe we attach all the buffers we grab to the scrub transaction so
+ * they'll all be freed when we cancel it.
+ */
+int
+xfs_scrub_ag_read_headers(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ struct xfs_buf **agi,
+ struct xfs_buf **agf,
+ struct xfs_buf **agfl)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+ goto out;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+ goto out;
+
+ error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+ goto out;
+
+out:
+ return error;
+}
+
+/* Release all the AG btree cursors. */
+void
+xfs_scrub_ag_btcur_free(
+ struct xfs_scrub_ag *sa)
+{
+ if (sa->refc_cur)
+ xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
+ if (sa->rmap_cur)
+ xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
+ if (sa->fino_cur)
+ xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
+ if (sa->ino_cur)
+ xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
+ if (sa->cnt_cur)
+ xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
+ if (sa->bno_cur)
+ xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
+
+ sa->refc_cur = NULL;
+ sa->rmap_cur = NULL;
+ sa->fino_cur = NULL;
+ sa->ino_cur = NULL;
+ sa->bno_cur = NULL;
+ sa->cnt_cur = NULL;
+}
+
+/* Initialize all the btree cursors for an AG. */
+int
+xfs_scrub_ag_btcur_init(
+ struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sa->agno;
+
+ if (sa->agf_bp) {
+ /* Set up a bnobt cursor for cross-referencing. */
+ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ agno, XFS_BTNUM_BNO);
+ if (!sa->bno_cur)
+ goto err;
+
+ /* Set up a cntbt cursor for cross-referencing. */
+ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ agno, XFS_BTNUM_CNT);
+ if (!sa->cnt_cur)
+ goto err;
+ }
+
+ /* Set up a inobt cursor for cross-referencing. */
+ if (sa->agi_bp) {
+ sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ agno, XFS_BTNUM_INO);
+ if (!sa->ino_cur)
+ goto err;
+ }
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ agno, XFS_BTNUM_FINO);
+ if (!sa->fino_cur)
+ goto err;
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ agno);
+ if (!sa->rmap_cur)
+ goto err;
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, agno, NULL);
+ if (!sa->refc_cur)
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENOMEM;
+}
+
+/* Release the AG header context and btree cursors. */
+void
+xfs_scrub_ag_free(
+ struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa)
+{
+ xfs_scrub_ag_btcur_free(sa);
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+ sa->agno = NULLAGNUMBER;
+}
+
+/*
+ * For scrub, grab the AGI and the AGF headers, in that order. Locking
+ * order requires us to get the AGI before the AGF. We use the
+ * transaction to avoid deadlocking on crosslinked metadata buffers;
+ * either the caller passes one in (bmap scrub) or we have to create a
+ * transaction ourselves.
+ */
+int
+xfs_scrub_ag_init(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ struct xfs_scrub_ag *sa)
+{
+ int error;
+
+ sa->agno = agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sa->agi_bp,
+ &sa->agf_bp, &sa->agfl_bp);
+ if (error)
+ return error;
+
+ return xfs_scrub_ag_btcur_init(sc, sa);
+}
+
+/* Per-scrubber setup functions */
+
+/* Set us up with a transaction and an empty context. */
+int
+xfs_scrub_setup_fs(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+}
+
+/* Set us up with AG headers and btree cursors. */
+int
+xfs_scrub_setup_ag_btree(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip,
+ bool force_log)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /*
+ * If the caller asks us to checkpont the log, do so. This
+ * expensive operation should be performed infrequently and only
+ * as a last resort. Any caller that sets force_log should
+ * document why they need to do so.
+ */
+ if (force_log) {
+ error = xfs_scrub_checkpoint_log(mp);
+ if (error)
+ return error;
+ }
+
+ error = xfs_scrub_setup_ag_header(sc, ip);
+ if (error)
+ return error;
+
+ return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
+}
+
+/* Push everything out of the log onto disk. */
+int
+xfs_scrub_checkpoint_log(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+ if (error)
+ return error;
+ xfs_ail_push_all_sync(mp->m_ail);
+ return 0;
+}
+
+/*
+ * Given an inode and the scrub control structure, grab either the
+ * inode referenced in the control structure or the inode passed in.
+ * The inode is not locked.
+ */
+int
+xfs_scrub_get_inode(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip_in)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = NULL;
+ int error;
+
+ /*
+ * If userspace passed us an AG number or a generation number
+ * without an inode number, they haven't got a clue so bail out
+ * immediately.
+ */
+ if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino))
+ return -EINVAL;
+
+ /* We want to scan the inode we already had opened. */
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+ sc->ip = ip_in;
+ return 0;
+ }
+
+ /* Look up the inode, see if the generation number matches. */
+ if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ return -ENOENT;
+ error = xfs_iget(mp, NULL, sc->sm->sm_ino,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
+ if (error == -ENOENT || error == -EINVAL) {
+ /* inode doesn't exist... */
+ return -ENOENT;
+ } else if (error) {
+ trace_xfs_scrub_op_error(sc,
+ XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
+ XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
+ return error;
+ }
+ if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
+ iput(VFS_I(ip));
+ return -ENOENT;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/* Set us up to scrub a file's contents. */
+int
+xfs_scrub_setup_inode_contents(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip,
+ unsigned int resblks)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_scrub_get_inode(sc, ip);
+ if (error)
+ return error;
+
+ /* Got the inode, lock it and we're ready to go. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ if (error)
+ goto out;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode for us */
+ return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
new file mode 100644
index 000000000000..5c043855570e
--- /dev/null
+++ b/fs/xfs/scrub/common.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_COMMON_H__
+#define __XFS_SCRUB_COMMON_H__
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (fatal signal, etc.) then bail out.
+ * Note that we're careful not to make any judgements about *error.
+ */
+static inline bool
+xfs_scrub_should_terminate(
+ struct xfs_scrub_context *sc,
+ int *error)
+{
+ if (fatal_signal_pending(current)) {
+ if (*error == 0)
+ *error = -EAGAIN;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic.
+ */
+static inline int
+xfs_scrub_trans_alloc(
+ struct xfs_scrub_metadata *sm,
+ struct xfs_mount *mp,
+ struct xfs_trans **tpp)
+{
+ return xfs_trans_alloc_empty(mp, tpp);
+}
+
+bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int *error);
+bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, int *error);
+
+void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino,
+ struct xfs_buf *bp);
+
+void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino,
+ struct xfs_buf *bp);
+void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino,
+ struct xfs_buf *bp);
+void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc);
+int xfs_scrub_checkpoint_log(struct xfs_mount *mp);
+
+/* Setup functions */
+int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode_bmap(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode_bmap_data(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_directory(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_xattr(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_symlink(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_parent(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+#ifdef CONFIG_XFS_RT
+int xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+#else
+static inline int
+xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+#else
+static inline int
+xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
+{
+ return -ENOENT;
+}
+#endif
+
+void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ struct xfs_buf **agi, struct xfs_buf **agf,
+ struct xfs_buf **agfl);
+void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa);
+int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
+ int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno,
+ void *),
+ void *priv);
+
+int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip, bool force_log);
+int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in);
+int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip, unsigned int resblks);
+
+#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
new file mode 100644
index 000000000000..d94edd93cba8
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/* Directory/Attribute Btree */
+
+/*
+ * Check for da btree operation errors. See the section about handling
+ * operational errors in common.c.
+ */
+bool
+xfs_scrub_da_process_error(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int *error)
+{
+ struct xfs_scrub_context *sc = ds->sc;
+
+ if (*error == 0)
+ return true;
+
+ switch (*error) {
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ *error = 0;
+ /* fall through */
+ default:
+ trace_xfs_scrub_file_op_error(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ *error, __return_address);
+ break;
+ }
+ return false;
+}
+
+/*
+ * Check for da btree corruption. See the section about handling
+ * operational errors in common.c.
+ */
+void
+xfs_scrub_da_set_corrupt(
+ struct xfs_scrub_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub_context *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ trace_xfs_scrub_fblock_error(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
+/* Find an entry at a certain level in a da btree. */
+STATIC void *
+xfs_scrub_da_btree_entry(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int rec)
+{
+ char *ents;
+ struct xfs_da_state_blk *blk;
+ void *baddr;
+
+ /* Dispatch the entry finding function. */
+ blk = &ds->state->path.blk[level];
+ baddr = blk->bp->b_addr;
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ ents = (char *)xfs_attr3_leaf_entryp(baddr);
+ return ents + (rec * sizeof(struct xfs_attr_leaf_entry));
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
+ return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
+ return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr);
+ return ents + (rec * sizeof(struct xfs_da_node_entry));
+ }
+
+ return NULL;
+}
+
+/* Scrub a da btree hash (key). */
+int
+xfs_scrub_da_btree_hash(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ __be32 *hashp)
+{
+ struct xfs_da_state_blk *blks;
+ struct xfs_da_node_entry *entry;
+ xfs_dahash_t hash;
+ xfs_dahash_t parent_hash;
+
+ /* Is this hash in order? */
+ hash = be32_to_cpu(*hashp);
+ if (hash < ds->hashes[level])
+ xfs_scrub_da_set_corrupt(ds, level);
+ ds->hashes[level] = hash;
+
+ if (level == 0)
+ return 0;
+
+ /* Is this hash no larger than the parent hash? */
+ blks = ds->state->path.blk;
+ entry = xfs_scrub_da_btree_entry(ds, level - 1, blks[level - 1].index);
+ parent_hash = be32_to_cpu(entry->hashval);
+ if (parent_hash < hash)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ return 0;
+}
+
+/*
+ * Check a da btree pointer. Returns true if it's ok to use this
+ * pointer.
+ */
+STATIC bool
+xfs_scrub_da_btree_ptr_ok(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
+ if (blkno < ds->lowest || (ds->highest != 0 && blkno >= ds->highest)) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * The da btree scrubber can handle leaf1 blocks as a degenerate
+ * form of leafn blocks. Since the regular da code doesn't handle
+ * leaf1, we must multiplex the verifiers.
+ */
+static void
+xfs_scrub_da_btree_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ /*
+ * xfs_da3_node_buf_ops already know how to handle
+ * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+ */
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ }
+}
+static void
+xfs_scrub_da_btree_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ bp->b_ops->verify_write(bp);
+ return;
+ default:
+ /*
+ * xfs_da3_node_buf_ops already know how to handle
+ * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+ */
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ bp->b_ops->verify_write(bp);
+ return;
+ }
+}
+
+static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = {
+ .name = "xfs_scrub_da_btree",
+ .verify_read = xfs_scrub_da_btree_read_verify,
+ .verify_write = xfs_scrub_da_btree_write_verify,
+};
+
+/* Check a block's sibling. */
+STATIC int
+xfs_scrub_da_btree_block_check_sibling(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int direction,
+ xfs_dablk_t sibling)
+{
+ int retval;
+ int error;
+
+ memcpy(&ds->state->altpath, &ds->state->path,
+ sizeof(ds->state->altpath));
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (sibling == 0) {
+ error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
+ direction, false, &retval);
+ if (error == 0 && retval == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ error = 0;
+ goto out;
+ }
+
+ /* Move the alternate cursor one block in the direction given. */
+ error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
+ direction, false, &retval);
+ if (!xfs_scrub_da_process_error(ds, level, &error))
+ return error;
+ if (retval) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return error;
+ }
+
+ /* Compare upper level pointer to sibling pointer. */
+ if (ds->state->altpath.blk[level].blkno != sibling)
+ xfs_scrub_da_set_corrupt(ds, level);
+ xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+out:
+ return error;
+}
+
+/* Check a block's sibling pointers. */
+STATIC int
+xfs_scrub_da_btree_block_check_siblings(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ struct xfs_da_blkinfo *hdr)
+{
+ xfs_dablk_t forw;
+ xfs_dablk_t back;
+ int error = 0;
+
+ forw = be32_to_cpu(hdr->forw);
+ back = be32_to_cpu(hdr->back);
+
+ /* Top level blocks should not have sibling pointers. */
+ if (level == 0) {
+ if (forw != 0 || back != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ return 0;
+ }
+
+ /*
+ * Check back (left) and forw (right) pointers. These functions
+ * absorb error codes for us.
+ */
+ error = xfs_scrub_da_btree_block_check_sibling(ds, level, 0, back);
+ if (error)
+ goto out;
+ error = xfs_scrub_da_btree_block_check_sibling(ds, level, 1, forw);
+
+out:
+ memset(&ds->state->altpath, 0, sizeof(ds->state->altpath));
+ return error;
+}
+
+/* Load a dir/attribute block from a btree. */
+STATIC int
+xfs_scrub_da_btree_block(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_blkinfo *hdr3;
+ struct xfs_da_args *dargs = &ds->dargs;
+ struct xfs_inode *ip = ds->dargs.dp;
+ xfs_ino_t owner;
+ int *pmaxrecs;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int error = 0;
+
+ blk = &ds->state->path.blk[level];
+ ds->state->path.active = level + 1;
+
+ /* Release old block. */
+ if (blk->bp) {
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+ }
+
+ /* Check the pointer. */
+ blk->blkno = blkno;
+ if (!xfs_scrub_da_btree_ptr_ok(ds, level, blkno))
+ goto out_nobuf;
+
+ /* Read the buffer. */
+ error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2,
+ &blk->bp, dargs->whichfork,
+ &xfs_scrub_da_btree_buf_ops);
+ if (!xfs_scrub_da_process_error(ds, level, &error))
+ goto out_nobuf;
+
+ /*
+ * We didn't find a dir btree root block, which means that
+ * there's no LEAF1/LEAFN tree (at least not where it's supposed
+ * to be), so jump out now.
+ */
+ if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 &&
+ blk->bp == NULL)
+ goto out_nobuf;
+
+ /* It's /not/ ok for attr trees not to have a da btree. */
+ if (blk->bp == NULL) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_nobuf;
+ }
+
+ hdr3 = blk->bp->b_addr;
+ blk->magic = be16_to_cpu(hdr3->hdr.magic);
+ pmaxrecs = &ds->maxrecs[level];
+
+ /* We only started zeroing the header on v5 filesystems. */
+ if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ /* Check the owner. */
+ if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) {
+ owner = be64_to_cpu(hdr3->owner);
+ if (owner != ip->i_ino)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+
+ /* Check the siblings. */
+ error = xfs_scrub_da_btree_block_check_siblings(ds, level, &hdr3->hdr);
+ if (error)
+ goto out;
+
+ /* Interpret the buffer. */
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_ATTR_LEAF_BUF);
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAFN_BUF);
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAF1_BUF);
+ blk->magic = XFS_DIR2_LEAF1_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DA_NODE_BUF);
+ blk->magic = XFS_DA_NODE_MAGIC;
+ node = blk->bp->b_addr;
+ ip->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = ip->d_ops->node_tree_p(node);
+ *pmaxrecs = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval);
+ if (level == 0) {
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ ds->tree_level = nodehdr.level;
+ } else {
+ if (ds->tree_level != nodehdr.level) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
+ /* XXX: Check hdr3.pad32 once we know how to fix it. */
+ break;
+ default:
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
+out:
+ return error;
+out_freebp:
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+out_nobuf:
+ blk->blkno = 0;
+ return error;
+}
+
+/* Visit all nodes and leaves of a da btree. */
+int
+xfs_scrub_da_btree(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_scrub_da_btree_rec_fn scrub_fn,
+ void *private)
+{
+ struct xfs_scrub_da_btree ds = {};
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_da_state_blk *blks;
+ struct xfs_da_node_entry *key;
+ void *rec;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+
+ /* Skip short format data structures; no btree to scan. */
+ if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ /* Set up initial da state. */
+ ds.dargs.dp = sc->ip;
+ ds.dargs.whichfork = whichfork;
+ ds.dargs.trans = sc->tp;
+ ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds.state = xfs_da_state_alloc();
+ ds.state->args = &ds.dargs;
+ ds.state->mp = mp;
+ ds.sc = sc;
+ ds.private = private;
+ if (whichfork == XFS_ATTR_FORK) {
+ ds.dargs.geo = mp->m_attr_geo;
+ ds.lowest = 0;
+ ds.highest = 0;
+ } else {
+ ds.dargs.geo = mp->m_dir_geo;
+ ds.lowest = ds.dargs.geo->leafblk;
+ ds.highest = ds.dargs.geo->freeblk;
+ }
+ blkno = ds.lowest;
+ level = 0;
+
+ /* Find the root of the da tree, if present. */
+ blks = ds.state->path.blk;
+ error = xfs_scrub_da_btree_block(&ds, level, blkno);
+ if (error)
+ goto out_state;
+ /*
+ * We didn't find a block at ds.lowest, which means that there's
+ * no LEAF1/LEAFN tree (at least not where it's supposed to be),
+ * so jump out now.
+ */
+ if (blks[level].bp == NULL)
+ goto out_state;
+
+ blks[level].index = 0;
+ while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) {
+ /* Handle leaf block. */
+ if (blks[level].magic != XFS_DA_NODE_MAGIC) {
+ /* End of leaf, pop back towards the root. */
+ if (blks[level].index >= ds.maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds.tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Dispatch record scrubbing. */
+ rec = xfs_scrub_da_btree_entry(&ds, level,
+ blks[level].index);
+ error = scrub_fn(&ds, level, rec);
+ if (error)
+ break;
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ blks[level].index++;
+ continue;
+ }
+
+
+ /* End of node, pop back towards the root. */
+ if (blks[level].index >= ds.maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds.tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Hashes in order for scrub? */
+ key = xfs_scrub_da_btree_entry(&ds, level, blks[level].index);
+ error = xfs_scrub_da_btree_hash(&ds, level, &key->hashval);
+ if (error)
+ goto out;
+
+ /* Drill another level deeper. */
+ blkno = be32_to_cpu(key->before);
+ level++;
+ ds.tree_level--;
+ error = xfs_scrub_da_btree_block(&ds, level, blkno);
+ if (error)
+ goto out;
+ if (blks[level].bp == NULL)
+ goto out;
+
+ blks[level].index = 0;
+ }
+
+out:
+ /* Release all the buffers we're tracking. */
+ for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) {
+ if (blks[level].bp == NULL)
+ continue;
+ xfs_trans_brelse(sc->tp, blks[level].bp);
+ blks[level].bp = NULL;
+ }
+
+out_state:
+ xfs_da_state_free(ds.state);
+ return error;
+}
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
new file mode 100644
index 000000000000..d31468d68cef
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_DABTREE_H__
+#define __XFS_SCRUB_DABTREE_H__
+
+/* dir/attr btree */
+
+struct xfs_scrub_da_btree {
+ struct xfs_da_args dargs;
+ xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH];
+ int maxrecs[XFS_DA_NODE_MAXDEPTH];
+ struct xfs_da_state *state;
+ struct xfs_scrub_context *sc;
+ void *private;
+
+ /*
+ * Lowest and highest directory block address in which we expect
+ * to find dir/attr btree node blocks. For a directory this
+ * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for
+ * attributes there is no limit.
+ */
+ xfs_dablk_t lowest;
+ xfs_dablk_t highest;
+
+ int tree_level;
+};
+
+typedef int (*xfs_scrub_da_btree_rec_fn)(struct xfs_scrub_da_btree *ds,
+ int level, void *rec);
+
+/* Check for da btree operation errors. */
+bool xfs_scrub_da_process_error(struct xfs_scrub_da_btree *ds, int level, int *error);
+
+/* Check for da btree corruption. */
+void xfs_scrub_da_set_corrupt(struct xfs_scrub_da_btree *ds, int level);
+
+int xfs_scrub_da_btree_hash(struct xfs_scrub_da_btree *ds, int level,
+ __be32 *hashp);
+int xfs_scrub_da_btree(struct xfs_scrub_context *sc, int whichfork,
+ xfs_scrub_da_btree_rec_fn scrub_fn, void *private);
+
+#endif /* __XFS_SCRUB_DABTREE_H__ */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
new file mode 100644
index 000000000000..69e1efdd4019
--- /dev/null
+++ b/fs/xfs/scrub/dir.c
@@ -0,0 +1,816 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/* Set us up to scrub directories. */
+int
+xfs_scrub_setup_directory(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Directories */
+
+/* Scrub a directory entry. */
+
+struct xfs_scrub_dir_ctx {
+ /* VFS fill-directory iterator */
+ struct dir_context dir_iter;
+
+ struct xfs_scrub_context *sc;
+};
+
+/* Check that an inode's mode matches a given DT_ type. */
+STATIC int
+xfs_scrub_dir_check_ftype(
+ struct xfs_scrub_dir_ctx *sdc,
+ xfs_fileoff_t offset,
+ xfs_ino_t inum,
+ int dtype)
+{
+ struct xfs_mount *mp = sdc->sc->mp;
+ struct xfs_inode *ip;
+ int ino_dtype;
+ int error = 0;
+
+ if (!xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (dtype != DT_UNKNOWN && dtype != DT_DIR)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ goto out;
+ }
+
+ /*
+ * Grab the inode pointed to by the dirent. We release the
+ * inode before we cancel the scrub transaction. Since we're
+ * don't know a priori that releasing the inode won't trigger
+ * eofblocks cleanup (which allocates what would be a nested
+ * transaction), we can't use DONTCACHE here because DONTCACHE
+ * inodes can trigger immediate inactive cleanup of the inode.
+ */
+ error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip);
+ if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
+ &error))
+ goto out;
+
+ /* Convert mode to the DT_* values that dir_emit uses. */
+ ino_dtype = xfs_dir3_get_dtype(mp,
+ xfs_mode_to_ftype(VFS_I(ip)->i_mode));
+ if (ino_dtype != dtype)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ iput(VFS_I(ip));
+out:
+ return error;
+}
+
+/*
+ * Scrub a single directory entry.
+ *
+ * We use the VFS directory iterator (i.e. readdir) to call this
+ * function for every directory entry in a directory. Once we're here,
+ * we check the inode number to make sure it's sane, then we check that
+ * we can look up this filename. Finally, we check the ftype.
+ */
+STATIC int
+xfs_scrub_dir_actor(
+ struct dir_context *dir_iter,
+ const char *name,
+ int namelen,
+ loff_t pos,
+ u64 ino,
+ unsigned type)
+{
+ struct xfs_mount *mp;
+ struct xfs_inode *ip;
+ struct xfs_scrub_dir_ctx *sdc;
+ struct xfs_name xname;
+ xfs_ino_t lookup_ino;
+ xfs_dablk_t offset;
+ int error = 0;
+
+ sdc = container_of(dir_iter, struct xfs_scrub_dir_ctx, dir_iter);
+ ip = sdc->sc->ip;
+ mp = ip->i_mount;
+ offset = xfs_dir2_db_to_da(mp->m_dir_geo,
+ xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+
+ /* Does this inode number make sense? */
+ if (!xfs_verify_dir_ino(mp, ino)) {
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ if (!strncmp(".", name, namelen)) {
+ /* If this is "." then check that the inum matches the dir. */
+ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ if (ino != ip->i_ino)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ } else if (!strncmp("..", name, namelen)) {
+ /*
+ * If this is ".." in the root inode, check that the inum
+ * matches this dir.
+ */
+ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ }
+
+ /* Verify that we can look up this name by hash. */
+ xname.name = name;
+ xname.len = namelen;
+ xname.type = XFS_DIR3_FT_UNKNOWN;
+
+ error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+ if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
+ &error))
+ goto fail_xref;
+ if (lookup_ino != ino) {
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ /* Verify the file type. This function absorbs error codes. */
+ error = xfs_scrub_dir_check_ftype(sdc, offset, lookup_ino, type);
+ if (error)
+ goto out;
+out:
+ return error;
+fail_xref:
+ return error;
+}
+
+/* Scrub a directory btree record. */
+STATIC int
+xfs_scrub_dir_rec(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ void *rec)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_dir2_leaf_entry *ent = rec;
+ struct xfs_inode *dp = ds->dargs.dp;
+ struct xfs_dir2_data_entry *dent;
+ struct xfs_buf *bp;
+ xfs_ino_t ino;
+ xfs_dablk_t rec_bno;
+ xfs_dir2_db_t db;
+ xfs_dir2_data_aoff_t off;
+ xfs_dir2_dataptr_t ptr;
+ xfs_dahash_t calc_hash;
+ xfs_dahash_t hash;
+ unsigned int tag;
+ int error;
+
+ /* Check the hash of the entry. */
+ error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval);
+ if (error)
+ goto out;
+
+ /* Valid hash pointer? */
+ ptr = be32_to_cpu(ent->address);
+ if (ptr == 0)
+ return 0;
+
+ /* Find the directory entry's location. */
+ db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr);
+ off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr);
+ rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db);
+
+ if (rec_bno >= mp->m_dir_geo->leafblk) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp);
+ if (!xfs_scrub_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
+ &error))
+ goto out;
+ if (!bp) {
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out;
+ }
+
+ /* Retrieve the entry, sanity check it, and compare hashes. */
+ dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
+ ino = be64_to_cpu(dent->inumber);
+ hash = be32_to_cpu(ent->hashval);
+ tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent));
+ if (!xfs_verify_dir_ino(mp, ino) || tag != off)
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ if (dent->namelen == 0) {
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+ calc_hash = xfs_da_hashname(dent->name, dent->namelen);
+ if (calc_hash != hash)
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+
+out_relse:
+ xfs_trans_brelse(ds->dargs.trans, bp);
+out:
+ return error;
+}
+
+/*
+ * Is this unused entry either in the bestfree or smaller than all of
+ * them? We've already checked that the bestfrees are sorted longest to
+ * shortest, and that there aren't any bogus entries.
+ */
+STATIC void
+xfs_scrub_directory_check_free_entry(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup)
+{
+ struct xfs_dir2_data_free *dfp;
+ unsigned int dup_length;
+
+ dup_length = be16_to_cpu(dup->length);
+
+ /* Unused entry is shorter than any of the bestfrees */
+ if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ return;
+
+ for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--)
+ if (dup_length == be16_to_cpu(dfp->length))
+ return;
+
+ /* Unused entry should be in the bestfrees but wasn't found. */
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/* Check free space info in a directory data block. */
+STATIC int
+xfs_scrub_directory_data_bestfree(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ bool is_block)
+{
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *dfp;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_free *bf;
+ struct xfs_mount *mp = sc->mp;
+ const struct xfs_dir_ops *d_ops;
+ char *ptr;
+ char *endptr;
+ u16 tag;
+ unsigned int nr_bestfrees = 0;
+ unsigned int nr_frees = 0;
+ unsigned int smallest_bestfree;
+ int newlen;
+ int offset;
+ int error;
+
+ d_ops = sc->ip->d_ops;
+
+ if (is_block) {
+ /* dir block format */
+ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ } else {
+ /* dir data format */
+ error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp);
+ }
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+
+ /* Do the bestfrees correspond to actual free space? */
+ bf = d_ops->data_bestfree_p(bp->b_addr);
+ smallest_bestfree = UINT_MAX;
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ offset = be16_to_cpu(dfp->offset);
+ if (offset == 0)
+ continue;
+ if (offset >= mp->m_dir_geo->blksize) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset);
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+
+ /* bestfree doesn't match the entry it points at? */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
+ be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
+ tag != ((char *)dup - (char *)bp->b_addr)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ /* bestfree records should be ordered largest to smallest */
+ if (smallest_bestfree < be16_to_cpu(dfp->length)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ smallest_bestfree = be16_to_cpu(dfp->length);
+ nr_bestfrees++;
+ }
+
+ /* Make sure the bestfrees are actually the best free spaces. */
+ ptr = (char *)d_ops->data_entry_p(bp->b_addr);
+ if (is_block) {
+ struct xfs_dir2_block_tail *btp;
+
+ btp = xfs_dir2_block_tail_p(mp->m_dir_geo, bp->b_addr);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+ } else
+ endptr = (char *)bp->b_addr + BBTOB(bp->b_length);
+
+ /* Iterate the entries, stopping when we hit or go past the end. */
+ while (ptr < endptr) {
+ dup = (struct xfs_dir2_data_unused *)ptr;
+ /* Skip real entries */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
+ struct xfs_dir2_data_entry *dep;
+
+ dep = (struct xfs_dir2_data_entry *)ptr;
+ newlen = d_ops->data_entsize(dep->namelen);
+ if (newlen <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ goto out_buf;
+ }
+ ptr += newlen;
+ continue;
+ }
+
+ /* Spot check this free entry */
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+ if (tag != ((char *)dup - (char *)bp->b_addr))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /*
+ * Either this entry is a bestfree or it's smaller than
+ * any of the bestfrees.
+ */
+ xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup);
+
+ /* Move on. */
+ newlen = be16_to_cpu(dup->length);
+ if (newlen <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ ptr += newlen;
+ if (ptr <= endptr)
+ nr_frees++;
+ }
+
+ /* We're required to fill all the space. */
+ if (ptr != endptr)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /* Did we see at least as many free slots as there are bestfrees? */
+ if (nr_frees < nr_bestfrees)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out_buf:
+ xfs_trans_brelse(sc->tp, bp);
+out:
+ return error;
+}
+
+/*
+ * Does the free space length in the free space index block ($len) match
+ * the longest length in the directory data block's bestfree array?
+ * Assume that we've already checked that the data block's bestfree
+ * array is in order.
+ */
+STATIC void
+xfs_scrub_directory_check_freesp(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ struct xfs_buf *dbp,
+ unsigned int len)
+{
+ struct xfs_dir2_data_free *dfp;
+
+ dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr);
+
+ if (len != be16_to_cpu(dfp->length))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ if (len > 0 && be16_to_cpu(dfp->offset) == 0)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/* Check free space info in a directory leaf1 block. */
+STATIC int
+xfs_scrub_directory_leaf1_bestfree(
+ struct xfs_scrub_context *sc,
+ struct xfs_da_args *args,
+ xfs_dablk_t lblk)
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_leaf_tail *ltp;
+ struct xfs_dir2_leaf *leaf;
+ struct xfs_buf *dbp;
+ struct xfs_buf *bp;
+ const struct xfs_dir_ops *d_ops = sc->ip->d_ops;
+ struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
+ __be16 *bestp;
+ __u16 best;
+ __u32 hash;
+ __u32 lasthash = 0;
+ __u32 bestcount;
+ unsigned int stale = 0;
+ int i;
+ int error;
+
+ /* Read the free space block. */
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ leaf = bp->b_addr;
+ d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = d_ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+ bestcount = be32_to_cpu(ltp->bestcount);
+ bestp = xfs_dir2_leaf_bests_p(ltp);
+
+ if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad != cpu_to_be32(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ }
+
+ /*
+ * There should be as many bestfree slots as there are dir data
+ * blocks that can fit under i_size.
+ */
+ if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Is the leaf count even remotely sane? */
+ if (leafhdr.count > d_ops->leaf_max_ents(geo)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if ((char *)&ents[leafhdr.count] > (char *)bestp) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Check hash value order, count stale entries. */
+ for (i = 0; i < leafhdr.count; i++) {
+ hash = be32_to_cpu(ents[i].hashval);
+ if (i > 0 && lasthash > hash)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ lasthash = hash;
+ if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ if (leafhdr.stale != stale)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /* Check all the bestfree entries. */
+ for (i = 0; i < bestcount; i++, bestp++) {
+ best = be16_to_cpu(*bestp);
+ if (best == NULLDATAOFF)
+ continue;
+ error = xfs_dir3_data_read(sc->tp, sc->ip,
+ i * args->geo->fsbcount, -1, &dbp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+ &error))
+ continue;
+ xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
+ xfs_trans_brelse(sc->tp, dbp);
+ }
+out:
+ return error;
+}
+
+/* Check free space info in a directory freespace block. */
+STATIC int
+xfs_scrub_directory_free_bestfree(
+ struct xfs_scrub_context *sc,
+ struct xfs_da_args *args,
+ xfs_dablk_t lblk)
+{
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_buf *dbp;
+ struct xfs_buf *bp;
+ __be16 *bestp;
+ __u16 best;
+ unsigned int stale = 0;
+ int i;
+ int error;
+
+ /* Read the free space block */
+ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad != cpu_to_be32(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ }
+
+ /* Check all the entries. */
+ sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr);
+ bestp = sc->ip->d_ops->free_bests_p(bp->b_addr);
+ for (i = 0; i < freehdr.nvalid; i++, bestp++) {
+ best = be16_to_cpu(*bestp);
+ if (best == NULLDATAOFF) {
+ stale++;
+ continue;
+ }
+ error = xfs_dir3_data_read(sc->tp, sc->ip,
+ (freehdr.firstdb + i) * args->geo->fsbcount,
+ -1, &dbp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+ &error))
+ continue;
+ xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
+ xfs_trans_brelse(sc->tp, dbp);
+ }
+
+ if (freehdr.nused + stale != freehdr.nvalid)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out:
+ return error;
+}
+
+/* Check free space information in directories. */
+STATIC int
+xfs_scrub_directory_blocks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_da_args args;
+ struct xfs_ifork *ifp;
+ struct xfs_mount *mp = sc->mp;
+ xfs_fileoff_t leaf_lblk;
+ xfs_fileoff_t free_lblk;
+ xfs_fileoff_t lblk;
+ struct xfs_iext_cursor icur;
+ xfs_dablk_t dabno;
+ bool found;
+ int is_block = 0;
+ int error;
+
+ /* Ignore local format directories. */
+ if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+ sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
+ leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
+ free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
+
+ /* Is this a block dir? */
+ args.dp = sc->ip;
+ args.geo = mp->m_dir_geo;
+ args.trans = sc->tp;
+ error = xfs_dir2_isblock(&args, &is_block);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ /* Iterate all the data extents in the directory... */
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found) {
+ /* Block directories only have a single block at offset 0. */
+ if (is_block &&
+ (got.br_startoff > 0 ||
+ got.br_blockcount != args.geo->fsbcount)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ got.br_startoff);
+ break;
+ }
+
+ /* No more data blocks... */
+ if (got.br_startoff >= leaf_lblk)
+ break;
+
+ /*
+ * Check each data block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xfs_scrub_directory_data_bestfree(sc, lblk,
+ is_block);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Look for a leaf1 block, which has free info. */
+ if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) &&
+ got.br_startoff == leaf_lblk &&
+ got.br_blockcount == args.geo->fsbcount &&
+ !xfs_iext_next_extent(ifp, &icur, &got)) {
+ if (is_block) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ error = xfs_scrub_directory_leaf1_bestfree(sc, &args,
+ leaf_lblk);
+ if (error)
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Scan for free blocks */
+ lblk = free_lblk;
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found) {
+ /*
+ * Dirs can't have blocks mapped above 2^32.
+ * Single-block dirs shouldn't even be here.
+ */
+ lblk = got.br_startoff;
+ if (lblk & ~0xFFFFFFFFULL) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ if (is_block) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /*
+ * Check each dir free block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xfs_scrub_directory_free_bestfree(sc, &args,
+ lblk);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+out:
+ return error;
+}
+
+/* Scrub a whole directory. */
+int
+xfs_scrub_directory(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_dir_ctx sdc = {
+ .dir_iter.actor = xfs_scrub_dir_actor,
+ .dir_iter.pos = 0,
+ .sc = sc,
+ };
+ size_t bufsize;
+ loff_t oldpos;
+ int error = 0;
+
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ /* Plausible size? */
+ if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ goto out;
+ }
+
+ /* Check directory tree structure */
+ error = xfs_scrub_da_btree(sc, XFS_DATA_FORK, xfs_scrub_dir_rec, NULL);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+
+ /* Check the freespace. */
+ error = xfs_scrub_directory_blocks(sc);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+
+ /*
+ * Check that every dirent we see can also be looked up by hash.
+ * Userspace usually asks for a 32k buffer, so we will too.
+ */
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+ sc->ip->i_d.di_size);
+
+ /*
+ * Look up every name in this directory by hash.
+ *
+ * Use the xfs_readdir function to call xfs_scrub_dir_actor on
+ * every directory entry in this directory. In _actor, we check
+ * the name, inode number, and ftype (if applicable) of the
+ * entry. xfs_readdir uses the VFS filldir functions to provide
+ * iteration context.
+ *
+ * The VFS grabs a read or write lock via i_rwsem before it reads
+ * or writes to a directory. If we've gotten this far we've
+ * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+ * getting a write lock on i_rwsem. Therefore, it is safe for us
+ * to drop the ILOCK here in order to reuse the _readdir and
+ * _dir_lookup routines, which do their own ILOCK locking.
+ */
+ oldpos = 0;
+ sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ while (true) {
+ error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ goto out;
+ if (oldpos == sdc.dir_iter.pos)
+ break;
+ oldpos = sdc.dir_iter.pos;
+ }
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
new file mode 100644
index 000000000000..496d6f2fbb9e
--- /dev/null
+++ b/fs/xfs/scrub/ialloc.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub inode btrees.
+ * If we detect a discrepancy between the inobt and the inode,
+ * try again after forcing logged inode cores out to disk.
+ */
+int
+xfs_scrub_setup_ag_iallocbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder);
+}
+
+/* Inode btree scrubber. */
+
+/* Is this chunk worth checking? */
+STATIC bool
+xfs_scrub_iallocbt_chunk(
+ struct xfs_scrub_btree *bs,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t bno;
+
+ bno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (bno + len <= bno ||
+ !xfs_verify_agbno(mp, agno, bno) ||
+ !xfs_verify_agbno(mp, agno, bno + len - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ return true;
+}
+
+/* Count the number of free inodes. */
+static unsigned int
+xfs_scrub_iallocbt_freecount(
+ xfs_inofree_t freemask)
+{
+ BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
+ return hweight64(freemask);
+}
+
+/* Check a particular inode with ir_free. */
+STATIC int
+xfs_scrub_iallocbt_check_cluster_freemask(
+ struct xfs_scrub_btree *bs,
+ xfs_ino_t fsino,
+ xfs_agino_t chunkino,
+ xfs_agino_t clusterino,
+ struct xfs_inobt_rec_incore *irec,
+ struct xfs_buf *bp)
+{
+ struct xfs_dinode *dip;
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ bool inode_is_free = false;
+ bool freemask_ok;
+ bool inuse;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(bs->sc, &error))
+ return error;
+
+ dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
+ (dip->di_version >= 3 &&
+ be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+ if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
+ inode_is_free = true;
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
+ fsino + clusterino, &inuse);
+ if (error == -ENODATA) {
+ /* Not cached, just read the disk buffer */
+ freemask_ok = inode_is_free ^ !!(dip->di_mode);
+ if (!bs->sc->try_harder && !freemask_ok)
+ return -EDEADLOCK;
+ } else if (error < 0) {
+ /*
+ * Inode is only half assembled, or there was an IO error,
+ * or the verifier failed, so don't bother trying to check.
+ * The inode scrubber can deal with this.
+ */
+ goto out;
+ } else {
+ /* Inode is all there. */
+ freemask_ok = inode_is_free ^ inuse;
+ }
+ if (!freemask_ok)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+out:
+ return 0;
+}
+
+/* Make sure the free mask is consistent with what the inodes think. */
+STATIC int
+xfs_scrub_iallocbt_check_freemask(
+ struct xfs_scrub_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_imap imap;
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *bp;
+ xfs_ino_t fsino;
+ xfs_agino_t nr_inodes;
+ xfs_agino_t agino;
+ xfs_agino_t chunkino;
+ xfs_agino_t clusterino;
+ xfs_agblock_t agbno;
+ int blks_per_cluster;
+ uint16_t holemask;
+ uint16_t ir_holemask;
+ int error = 0;
+
+ /* Make sure the freemask matches the inode records. */
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+
+ for (agino = irec->ir_startino;
+ agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
+ agino += blks_per_cluster * mp->m_sb.sb_inopblock) {
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ chunkino = agino - irec->ir_startino;
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+
+ /* Compute the holemask mask for this cluster. */
+ for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
+ clusterino += XFS_INODES_PER_HOLEMASK_BIT)
+ holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /* The whole cluster must be a hole or not a hole. */
+ ir_holemask = (irec->ir_holemask & holemask);
+ if (ir_holemask != holemask && ir_holemask != 0) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ continue;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask)
+ continue;
+
+ /* Grab the inode cluster buffer. */
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
+ agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap.im_boffset = 0;
+
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
+ &dip, &bp, 0, 0);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error))
+ continue;
+
+ /* Which inodes are free? */
+ for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
+ error = xfs_scrub_iallocbt_check_cluster_freemask(bs,
+ fsino, chunkino, clusterino, irec, bp);
+ if (error) {
+ xfs_trans_brelse(bs->cur->bc_tp, bp);
+ return error;
+ }
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, bp);
+ }
+
+ return error;
+}
+
+/* Scrub an inobt/finobt record. */
+STATIC int
+xfs_scrub_iallocbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_inobt_rec_incore irec;
+ uint64_t holes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agino_t agino;
+ xfs_agblock_t agbno;
+ xfs_extlen_t len;
+ int holecount;
+ int i;
+ int error = 0;
+ unsigned int real_freecount;
+ uint16_t holemask;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ if (irec.ir_count > XFS_INODES_PER_CHUNK ||
+ irec.ir_freecount > XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ real_freecount = irec.ir_freecount +
+ (XFS_INODES_PER_CHUNK - irec.ir_count);
+ if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ agino = irec.ir_startino;
+ /* Record has to be properly aligned within the AG. */
+ if (!xfs_verify_agino(mp, agno, agino) ||
+ !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+ /* Make sure this record is aligned to cluster and inoalignmnt size. */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
+ if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
+ (agbno & (xfs_icluster_size_fsb(mp) - 1)))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /* Handle non-sparse inodes */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
+ if (irec.ir_count != XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
+ goto out;
+ goto check_freemask;
+ }
+
+ /* Check each chunk of a sparse inode cluster. */
+ holemask = irec.ir_holemask;
+ holecount = 0;
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
+ holes = ~xfs_inobt_irec_to_allocmask(&irec);
+ if ((holes & irec.ir_free) != holes ||
+ irec.ir_freecount > irec.ir_count)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
+ if (holemask & 1)
+ holecount += XFS_INODES_PER_HOLEMASK_BIT;
+ else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
+ break;
+ holemask >>= 1;
+ agino += XFS_INODES_PER_HOLEMASK_BIT;
+ }
+
+ if (holecount > XFS_INODES_PER_CHUNK ||
+ holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+check_freemask:
+ error = xfs_scrub_iallocbt_check_freemask(bs, &irec);
+ if (error)
+ goto out;
+
+out:
+ return error;
+}
+
+/* Scrub the inode btrees for some AG. */
+STATIC int
+xfs_scrub_iallocbt(
+ struct xfs_scrub_context *sc,
+ xfs_btnum_t which)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
+ cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
+ return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, NULL);
+}
+
+int
+xfs_scrub_inobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_iallocbt(sc, XFS_BTNUM_INO);
+}
+
+int
+xfs_scrub_finobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO);
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
new file mode 100644
index 000000000000..637b7a892313
--- /dev/null
+++ b/fs/xfs/scrub/inode.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Grab total control of the inode metadata. It doesn't matter here if
+ * the file data is still changing; exclusive access to the metadata is
+ * the goal.
+ */
+int
+xfs_scrub_setup_inode(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /*
+ * Try to get the inode. If the verifiers fail, we try again
+ * in raw mode.
+ */
+ error = xfs_scrub_get_inode(sc, ip);
+ switch (error) {
+ case 0:
+ break;
+ case -EFSCORRUPTED:
+ case -EFSBADCRC:
+ return 0;
+ default:
+ return error;
+ }
+
+ /* Got the inode, lock it and we're ready to go. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ if (error)
+ goto out;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode for us */
+ return error;
+}
+
+/* Inode core */
+
+/*
+ * Validate di_extsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_extsize().
+ * These functions must be kept in sync with each other.
+ */
+STATIC void
+xfs_scrub_inode_extsize(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags)
+{
+ struct xfs_mount *mp = sc->mp;
+ bool rt_flag;
+ bool hint_flag;
+ bool inherit_flag;
+ uint32_t extsize;
+ uint32_t extsize_bytes;
+ uint32_t blocksize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
+ inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
+ extsize = be32_to_cpu(dip->di_extsize);
+ extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize);
+
+ if (rt_flag)
+ blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
+ if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
+ goto bad;
+
+ if (hint_flag && !S_ISREG(mode))
+ goto bad;
+
+ if (inherit_flag && !S_ISDIR(mode))
+ goto bad;
+
+ if ((hint_flag || inherit_flag) && extsize == 0)
+ goto bad;
+
+ if (!(hint_flag || inherit_flag) && extsize != 0)
+ goto bad;
+
+ if (extsize_bytes % blocksize_bytes)
+ goto bad;
+
+ if (extsize > MAXEXTLEN)
+ goto bad;
+
+ if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
+ goto bad;
+
+ return;
+bad:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
+ * These functions must be kept in sync with each other.
+ */
+STATIC void
+xfs_scrub_inode_cowextsize(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ struct xfs_mount *mp = sc->mp;
+ bool rt_flag;
+ bool hint_flag;
+ uint32_t extsize;
+ uint32_t extsize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
+ extsize = be32_to_cpu(dip->di_cowextsize);
+ extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize);
+
+ if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb))
+ goto bad;
+
+ if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
+ goto bad;
+
+ if (hint_flag && extsize == 0)
+ goto bad;
+
+ if (!hint_flag && extsize != 0)
+ goto bad;
+
+ if (hint_flag && rt_flag)
+ goto bad;
+
+ if (extsize_bytes % mp->m_sb.sb_blocksize)
+ goto bad;
+
+ if (extsize > MAXEXTLEN)
+ goto bad;
+
+ if (extsize > mp->m_sb.sb_agblocks / 2)
+ goto bad;
+
+ return;
+bad:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/* Make sure the di_flags make sense for the inode. */
+STATIC void
+xfs_scrub_inode_flags(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ if (flags & ~XFS_DIFLAG_ANY)
+ goto bad;
+
+ /* rt flags require rt device */
+ if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) &&
+ !mp->m_rtdev_targp)
+ goto bad;
+
+ /* new rt bitmap flag only valid for rbmino */
+ if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino)
+ goto bad;
+
+ /* directory-only flags */
+ if ((flags & (XFS_DIFLAG_RTINHERIT |
+ XFS_DIFLAG_EXTSZINHERIT |
+ XFS_DIFLAG_PROJINHERIT |
+ XFS_DIFLAG_NOSYMLINKS)) &&
+ !S_ISDIR(mode))
+ goto bad;
+
+ /* file-only flags */
+ if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) &&
+ !S_ISREG(mode))
+ goto bad;
+
+ /* filestreams and rt make no sense */
+ if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
+ goto bad;
+
+ return;
+bad:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/* Make sure the di_flags2 make sense for the inode. */
+STATIC void
+xfs_scrub_inode_flags2(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ if (flags2 & ~XFS_DIFLAG2_ANY)
+ goto bad;
+
+ /* reflink flag requires reflink feature */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) &&
+ !xfs_sb_version_hasreflink(&mp->m_sb))
+ goto bad;
+
+ /* cowextsize flag is checked w.r.t. mode separately */
+
+ /* file/dir-only flags */
+ if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode)))
+ goto bad;
+
+ /* file-only flags */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
+ goto bad;
+
+ /* realtime and reflink make no sense, currently */
+ if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+ goto bad;
+
+ /* dax and reflink make no sense, currently */
+ if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK))
+ goto bad;
+
+ return;
+bad:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/* Scrub all the ondisk inode fields. */
+STATIC void
+xfs_scrub_dinode(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = sc->mp;
+ size_t fork_recs;
+ unsigned long long isize;
+ uint64_t flags2;
+ uint32_t nextents;
+ uint16_t flags;
+ uint16_t mode;
+
+ flags = be16_to_cpu(dip->di_flags);
+ if (dip->di_version >= 3)
+ flags2 = be64_to_cpu(dip->di_flags2);
+ else
+ flags2 = 0;
+
+ /* di_mode */
+ mode = be16_to_cpu(dip->di_mode);
+ if (mode & ~(S_IALLUGO | S_IFMT))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* v1/v2 fields */
+ switch (dip->di_version) {
+ case 1:
+ /*
+ * We autoconvert v1 inodes into v2 inodes on writeout,
+ * so just mark this inode for preening.
+ */
+ xfs_scrub_ino_set_preen(sc, ino, bp);
+ break;
+ case 2:
+ case 3:
+ if (dip->di_onlink != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ if (dip->di_mode == 0 && sc->ip)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ if (dip->di_projid_hi != 0 &&
+ !xfs_sb_version_hasprojid32bit(&mp->m_sb))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ return;
+ }
+
+ /*
+ * di_uid/di_gid -- -1 isn't invalid, but there's no way that
+ * userspace could have created that.
+ */
+ if (dip->di_uid == cpu_to_be32(-1U) ||
+ dip->di_gid == cpu_to_be32(-1U))
+ xfs_scrub_ino_set_warning(sc, ino, bp);
+
+ /* di_format */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
+ !S_ISFIFO(mode) && !S_ISSOCK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if (!S_ISDIR(mode) && !S_ISLNK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (!S_ISREG(mode) && !S_ISDIR(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ }
+
+ /*
+ * di_size. xfs_dinode_verify checks for things that screw up
+ * the VFS such as the upper bit being set and zero-length
+ * symlinks/directories, but we can do more here.
+ */
+ isize = be64_to_cpu(dip->di_size);
+ if (isize & (1ULL << 63))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* Devices, fifos, and sockets must have zero size */
+ if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* Directories can't be larger than the data section size (32G) */
+ if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* Symlinks can't be larger than SYMLINK_MAXLEN */
+ if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /*
+ * Warn if the running kernel can't handle the kinds of offsets
+ * needed to deal with the file size. In other words, if the
+ * pagecache can't cache all the blocks in this file due to
+ * overly large offsets, flag the inode for admin review.
+ */
+ if (isize >= mp->m_super->s_maxbytes)
+ xfs_scrub_ino_set_warning(sc, ino, bp);
+
+ /* di_nblocks */
+ if (flags2 & XFS_DIFLAG2_REFLINK) {
+ ; /* nblocks can exceed dblocks */
+ } else if (flags & XFS_DIFLAG_REALTIME) {
+ /*
+ * nblocks is the sum of data extents (in the rtdev),
+ * attr extents (in the datadev), and both forks' bmbt
+ * blocks (in the datadev). This clumsy check is the
+ * best we can do without cross-referencing with the
+ * inode forks.
+ */
+ if (be64_to_cpu(dip->di_nblocks) >=
+ mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ } else {
+ if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ }
+
+ xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags);
+
+ xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags);
+
+ /* di_nextents */
+ nextents = be32_to_cpu(dip->di_nextents);
+ fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ default:
+ if (nextents != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ }
+
+ /* di_forkoff */
+ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* di_aformat */
+ if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
+ dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ dip->di_aformat != XFS_DINODE_FMT_BTREE)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* di_anextents */
+ nextents = be16_to_cpu(dip->di_anextents);
+ fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ default:
+ if (nextents != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ }
+
+ if (dip->di_version >= 3) {
+ xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2);
+ xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags,
+ flags2);
+ }
+}
+
+/* Map and read a raw inode. */
+STATIC int
+xfs_scrub_inode_map_raw(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino,
+ struct xfs_buf **bpp,
+ struct xfs_dinode **dipp)
+{
+ struct xfs_imap imap;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_dinode *dip;
+ int error;
+
+ error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
+ if (error == -EINVAL) {
+ /*
+ * Inode could have gotten deleted out from under us;
+ * just forget about it.
+ */
+ error = -ENOENT;
+ goto out;
+ }
+ if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ goto out;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp,
+ NULL);
+ if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ goto out;
+
+ /*
+ * Is this really an inode? We disabled verifiers in the above
+ * xfs_trans_read_buf call because the inode buffer verifier
+ * fails on /any/ inode record in the inode cluster with a bad
+ * magic or version number, not just the one that we're
+ * checking. Therefore, grab the buffer unconditionally, attach
+ * the inode verifiers by hand, and run the inode verifier only
+ * on the one inode we want.
+ */
+ bp->b_ops = &xfs_inode_buf_ops;
+ dip = xfs_buf_offset(bp, imap.im_boffset);
+ if (!xfs_dinode_verify(mp, ino, dip) ||
+ !xfs_dinode_good_version(mp, dip->di_version)) {
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ goto out_buf;
+ }
+
+ /* ...and is it the one we asked for? */
+ if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) {
+ error = -ENOENT;
+ goto out_buf;
+ }
+
+ *dipp = dip;
+ *bpp = bp;
+out:
+ return error;
+out_buf:
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+}
+
+/* Scrub an inode. */
+int
+xfs_scrub_inode(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_dinode di;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_dinode *dip;
+ xfs_ino_t ino;
+
+ bool has_shared;
+ int error = 0;
+
+ /* Did we get the in-core inode, or are we doing this manually? */
+ if (sc->ip) {
+ ino = sc->ip->i_ino;
+ xfs_inode_to_disk(sc->ip, &di, 0);
+ dip = &di;
+ } else {
+ /* Map & read inode. */
+ ino = sc->sm->sm_ino;
+ error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip);
+ if (error || !bp)
+ goto out;
+ }
+
+ xfs_scrub_dinode(sc, bp, dip, ino);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Now let's do the things that require a live inode. */
+ if (!sc->ip)
+ goto out;
+
+ /*
+ * Does this inode have the reflink flag set but no shared extents?
+ * Set the preening flag if this is the case.
+ */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+ &has_shared);
+ if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ goto out;
+ if (!has_shared)
+ xfs_scrub_ino_set_preen(sc, ino, bp);
+ }
+
+out:
+ if (bp)
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
new file mode 100644
index 000000000000..63a25334fc83
--- /dev/null
+++ b/fs/xfs/scrub/parent.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up to scrub parents. */
+int
+xfs_scrub_setup_parent(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Parent pointers */
+
+/* Look for an entry in a parent pointing to this inode. */
+
+struct xfs_scrub_parent_ctx {
+ struct dir_context dc;
+ xfs_ino_t ino;
+ xfs_nlink_t nlink;
+};
+
+/* Look for a single entry in a directory pointing to an inode. */
+STATIC int
+xfs_scrub_parent_actor(
+ struct dir_context *dc,
+ const char *name,
+ int namelen,
+ loff_t pos,
+ u64 ino,
+ unsigned type)
+{
+ struct xfs_scrub_parent_ctx *spc;
+
+ spc = container_of(dc, struct xfs_scrub_parent_ctx, dc);
+ if (spc->ino == ino)
+ spc->nlink++;
+ return 0;
+}
+
+/* Count the number of dentries in the parent dir that point to this inode. */
+STATIC int
+xfs_scrub_parent_count_parent_dentries(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *parent,
+ xfs_nlink_t *nlink)
+{
+ struct xfs_scrub_parent_ctx spc = {
+ .dc.actor = xfs_scrub_parent_actor,
+ .dc.pos = 0,
+ .ino = sc->ip->i_ino,
+ .nlink = 0,
+ };
+ size_t bufsize;
+ loff_t oldpos;
+ uint lock_mode;
+ int error = 0;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there. This is
+ * how we guarantee that the parent's extent map has been loaded,
+ * if there is one.
+ */
+ lock_mode = xfs_ilock_data_map_shared(parent);
+ if (parent->i_d.di_nextents > 0)
+ error = xfs_dir3_data_readahead(parent, 0, -1);
+ xfs_iunlock(parent, lock_mode);
+ if (error)
+ return error;
+
+ /*
+ * Iterate the parent dir to confirm that there is
+ * exactly one entry pointing back to the inode being
+ * scanned.
+ */
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+ parent->i_d.di_size);
+ oldpos = 0;
+ while (true) {
+ error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
+ if (error)
+ goto out;
+ if (oldpos == spc.dc.pos)
+ break;
+ oldpos = spc.dc.pos;
+ }
+ *nlink = spc.nlink;
+out:
+ return error;
+}
+
+/*
+ * Given the inode number of the alleged parent of the inode being
+ * scrubbed, try to validate that the parent has exactly one directory
+ * entry pointing back to the inode being scrubbed.
+ */
+STATIC int
+xfs_scrub_parent_validate(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t dnum,
+ bool *try_again)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *dp = NULL;
+ xfs_nlink_t expected_nlink;
+ xfs_nlink_t nlink;
+ int error = 0;
+
+ *try_again = false;
+
+ /* '..' must not point to ourselves. */
+ if (sc->ip->i_ino == dnum) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /*
+ * If we're an unlinked directory, the parent /won't/ have a link
+ * to us. Otherwise, it should have one link.
+ */
+ expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+ /*
+ * Grab this parent inode. We release the inode before we
+ * cancel the scrub transaction. Since we're don't know a
+ * priori that releasing the inode won't trigger eofblocks
+ * cleanup (which allocates what would be a nested transaction)
+ * if the parent pointer erroneously points to a file, we
+ * can't use DONTCACHE here because DONTCACHE inodes can trigger
+ * immediate inactive cleanup of the inode.
+ */
+ error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (dp == sc->ip) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_rele;
+ }
+
+ /*
+ * We prefer to keep the inode locked while we lock and search
+ * its alleged parent for a forward reference. If we can grab
+ * the iolock, validate the pointers and we're done. We must
+ * use nowait here to avoid an ABBA deadlock on the parent and
+ * the child inodes.
+ */
+ if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) {
+ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ goto out_unlock;
+ if (nlink != expected_nlink)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_unlock;
+ }
+
+ /*
+ * The game changes if we get here. We failed to lock the parent,
+ * so we're going to try to verify both pointers while only holding
+ * one lock so as to avoid deadlocking with something that's actually
+ * trying to traverse down the directory tree.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
+
+ /* Go looking for our dentry. */
+ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_unlock;
+
+ /* Drop the parent lock, relock this inode. */
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /*
+ * If we're an unlinked directory, the parent /won't/ have a link
+ * to us. Otherwise, it should have one link. We have to re-set
+ * it here because we dropped the lock on sc->ip.
+ */
+ expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+ /* Look up '..' to see if the inode changed. */
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_rele;
+
+ /* Drat, parent changed. Try again! */
+ if (dnum != dp->i_ino) {
+ iput(VFS_I(dp));
+ *try_again = true;
+ return 0;
+ }
+ iput(VFS_I(dp));
+
+ /*
+ * '..' didn't change, so check that there was only one entry
+ * for us in the parent.
+ */
+ if (nlink != expected_nlink)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return error;
+
+out_unlock:
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+out_rele:
+ iput(VFS_I(dp));
+out:
+ return error;
+}
+
+/* Scrub a parent pointer. */
+int
+xfs_scrub_parent(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t dnum;
+ bool try_again;
+ int tries = 0;
+ int error = 0;
+
+ /*
+ * If we're a directory, check that the '..' link points up to
+ * a directory that has one entry pointing to us.
+ */
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ /* We're not a special inode, are we? */
+ if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /*
+ * The VFS grabs a read or write lock via i_rwsem before it reads
+ * or writes to a directory. If we've gotten this far we've
+ * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+ * getting a write lock on i_rwsem. Therefore, it is safe for us
+ * to drop the ILOCK here in order to do directory lookups.
+ */
+ sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
+ /* Look up '..' */
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (!xfs_verify_dir_ino(mp, dnum)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (sc->ip == mp->m_rootip) {
+ if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
+ sc->ip->i_ino != dnum)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ do {
+ error = xfs_scrub_parent_validate(sc, dnum, &try_again);
+ if (error)
+ goto out;
+ } while (try_again && ++tries < 20);
+
+ /*
+ * We gave it our best shot but failed, so mark this scrub
+ * incomplete. Userspace can decide if it wants to try again.
+ */
+ if (try_again && tries == 20)
+ xfs_scrub_set_incomplete(sc);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
new file mode 100644
index 000000000000..8e58ba842946
--- /dev/null
+++ b/fs/xfs/scrub/quota.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Convert a scrub type code to a DQ flag, or return 0 if error. */
+static inline uint
+xfs_scrub_quota_to_dqtype(
+ struct xfs_scrub_context *sc)
+{
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_UQUOTA:
+ return XFS_DQ_USER;
+ case XFS_SCRUB_TYPE_GQUOTA:
+ return XFS_DQ_GROUP;
+ case XFS_SCRUB_TYPE_PQUOTA:
+ return XFS_DQ_PROJ;
+ default:
+ return 0;
+ }
+}
+
+/* Set us up to scrub a quota. */
+int
+xfs_scrub_setup_quota(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ uint dqtype;
+
+ /*
+ * If userspace gave us an AG number or inode data, they don't
+ * know what they're doing. Get out.
+ */
+ if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen)
+ return -EINVAL;
+
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
+ if (dqtype == 0)
+ return -EINVAL;
+ if (!xfs_this_quota_on(sc->mp, dqtype))
+ return -ENOENT;
+ return 0;
+}
+
+/* Quotas. */
+
+/* Scrub the fields in an individual quota item. */
+STATIC void
+xfs_scrub_quota_item(
+ struct xfs_scrub_context *sc,
+ uint dqtype,
+ struct xfs_dquot *dq,
+ xfs_dqid_t id)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset;
+ unsigned long long bsoft;
+ unsigned long long isoft;
+ unsigned long long rsoft;
+ unsigned long long bhard;
+ unsigned long long ihard;
+ unsigned long long rhard;
+ unsigned long long bcount;
+ unsigned long long icount;
+ unsigned long long rcount;
+ xfs_ino_t fs_icount;
+
+ offset = id * qi->qi_dqperchunk;
+
+ /*
+ * We fed $id and DQNEXT into the xfs_qm_dqget call, which means
+ * that the actual dquot we got must either have the same id or
+ * the next higher id.
+ */
+ if (id > be32_to_cpu(d->d_id))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Did we get the dquot type we wanted? */
+ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the limits. */
+ bhard = be64_to_cpu(d->d_blk_hardlimit);
+ ihard = be64_to_cpu(d->d_ino_hardlimit);
+ rhard = be64_to_cpu(d->d_rtb_hardlimit);
+
+ bsoft = be64_to_cpu(d->d_blk_softlimit);
+ isoft = be64_to_cpu(d->d_ino_softlimit);
+ rsoft = be64_to_cpu(d->d_rtb_softlimit);
+
+ /*
+ * Warn if the hard limits are larger than the fs.
+ * Administrators can do this, though in production this seems
+ * suspect, which is why we flag it for review.
+ *
+ * Complain about corruption if the soft limit is greater than
+ * the hard limit.
+ */
+ if (bhard > mp->m_sb.sb_dblocks)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (bsoft > bhard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (ihard > mp->m_maxicount)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (isoft > ihard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (rhard > mp->m_sb.sb_rblocks)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (rsoft > rhard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the resource counts. */
+ bcount = be64_to_cpu(d->d_bcount);
+ icount = be64_to_cpu(d->d_icount);
+ rcount = be64_to_cpu(d->d_rtbcount);
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_dblocks < bcount)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
+ } else {
+ if (mp->m_sb.sb_dblocks < bcount)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
+ }
+ if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /*
+ * We can violate the hard limits if the admin suddenly sets a
+ * lower limit than the actual usage. However, we flag it for
+ * admin review.
+ */
+ if (id != 0 && bhard != 0 && bcount > bhard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (id != 0 && ihard != 0 && icount > ihard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (id != 0 && rhard != 0 && rcount > rhard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+}
+
+/* Scrub all of a quota type's items. */
+int
+xfs_scrub_quota(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *dq;
+ xfs_fileoff_t max_dqid_off;
+ xfs_fileoff_t off = 0;
+ xfs_dqid_t id = 0;
+ uint dqtype;
+ int nimaps;
+ int error;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return -ENOENT;
+
+ mutex_lock(&qi->qi_quotaofflock);
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
+ if (!xfs_this_quota_on(sc->mp, dqtype)) {
+ error = -ENOENT;
+ goto out_unlock_quota;
+ }
+
+ /* Attach to the quota inode and set sc->ip so that reporting works. */
+ ip = xfs_quota_inode(sc->mp, dqtype);
+ sc->ip = ip;
+
+ /* Look for problem extents. */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ goto out_unlock_inode;
+ }
+ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ while (1) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+
+ off = irec.br_startoff + irec.br_blockcount;
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps,
+ XFS_BMAPI_ENTIRE);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
+ &error))
+ goto out_unlock_inode;
+ if (!nimaps)
+ break;
+ if (irec.br_startblock == HOLESTARTBLOCK)
+ continue;
+
+ /* Check the extent record doesn't point to crap. */
+ if (irec.br_startblock + irec.br_blockcount <=
+ irec.br_startblock)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+ if (!xfs_verify_fsbno(mp, irec.br_startblock) ||
+ !xfs_verify_fsbno(mp, irec.br_startblock +
+ irec.br_blockcount - 1))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+
+ /*
+ * Unwritten extents or blocks mapped above the highest
+ * quota id shouldn't happen.
+ */
+ if (isnullstartblock(irec.br_startblock) ||
+ irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check all the quota items. */
+ while (id < ((xfs_dqid_t)-1ULL)) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+
+ error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT,
+ &dq);
+ if (error == -ENOENT)
+ break;
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
+ id * qi->qi_dqperchunk, &error))
+ break;
+
+ xfs_scrub_quota_item(sc, dqtype, dq, id);
+
+ id = be32_to_cpu(dq->q_core.d_id) + 1;
+ xfs_qm_dqput(dq);
+ if (!id)
+ break;
+ }
+
+out:
+ /* We set sc->ip earlier, so make sure we clear it now. */
+ sc->ip = NULL;
+out_unlock_quota:
+ mutex_unlock(&qi->qi_quotaofflock);
+ return error;
+
+out_unlock_inode:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
+}
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
new file mode 100644
index 000000000000..2f88a8d44bd0
--- /dev/null
+++ b/fs/xfs/scrub/refcount.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub reference count btrees.
+ */
+int
+xfs_scrub_setup_ag_refcountbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Reference count btree scrubber. */
+
+/* Scrub a refcountbt record. */
+STATIC int
+xfs_scrub_refcountbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+ bool has_cowflag;
+ int error = 0;
+
+ bno = be32_to_cpu(rec->refc.rc_startblock);
+ len = be32_to_cpu(rec->refc.rc_blockcount);
+ refcount = be32_to_cpu(rec->refc.rc_refcount);
+
+ /* Only CoW records can have refcount == 1. */
+ has_cowflag = (bno & XFS_REFC_COW_START);
+ if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /* Check the extent. */
+ bno &= ~XFS_REFC_COW_START;
+ if (bno + len <= bno ||
+ !xfs_verify_agbno(mp, agno, bno) ||
+ !xfs_verify_agbno(mp, agno, bno + len - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (refcount == 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ return error;
+}
+
+/* Scrub the refcount btree for some AG. */
+int
+xfs_scrub_refcountbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+ return xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec,
+ &oinfo, NULL);
+}
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
new file mode 100644
index 000000000000..97846c424690
--- /dev/null
+++ b/fs/xfs/scrub/rmap.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub reverse mapping btrees.
+ */
+int
+xfs_scrub_setup_ag_rmapbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Reverse-mapping scrubber. */
+
+/* Scrub an rmapbt record. */
+STATIC int
+xfs_scrub_rmapbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_rmap_irec irec;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ bool non_inode;
+ bool is_unwritten;
+ bool is_bmbt;
+ bool is_attr;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error))
+ goto out;
+
+ /* Check extent. */
+ if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rm_owner == XFS_RMAP_OWN_FS) {
+ /*
+ * xfs_verify_agbno returns false for static fs metadata.
+ * Since that only exists at the start of the AG, validate
+ * that by hand.
+ */
+ if (irec.rm_startblock != 0 ||
+ irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /*
+ * Otherwise we must point somewhere past the static metadata
+ * but before the end of the FS. Run the regular check.
+ */
+ if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
+ !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+ irec.rm_blockcount - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+
+ /* Check flags. */
+ non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner);
+ is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN;
+
+ if (is_bmbt && irec.rm_offset != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && irec.rm_offset != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (is_unwritten && (is_bmbt || non_inode || is_attr))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && (is_bmbt || is_unwritten || is_attr))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!non_inode) {
+ if (!xfs_verify_ino(mp, irec.rm_owner))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /* Non-inode owner within the magic values? */
+ if (irec.rm_owner <= XFS_RMAP_OWN_MIN ||
+ irec.rm_owner > XFS_RMAP_OWN_FS)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+out:
+ return error;
+}
+
+/* Scrub the rmap btree for some AG. */
+int
+xfs_scrub_rmapbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec,
+ &oinfo, NULL);
+}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
new file mode 100644
index 000000000000..c6fedb698008
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up with the realtime metadata locked. */
+int
+xfs_scrub_setup_rt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /*
+ * If userspace gave us an AG number or inode data, they don't
+ * know what they're doing. Get out.
+ */
+ if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen)
+ return -EINVAL;
+
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
+ sc->ip = mp->m_rbmip;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ return 0;
+}
+
+/* Realtime bitmap. */
+
+/* Scrub a free extent record from the realtime bitmap. */
+STATIC int
+xfs_scrub_rtbitmap_rec(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub_context *sc = priv;
+
+ if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock ||
+ !xfs_verify_rtbno(sc->mp, rec->ar_startblock) ||
+ !xfs_verify_rtbno(sc->mp, rec->ar_startblock +
+ rec->ar_blockcount - 1))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+}
+
+/* Scrub the realtime bitmap. */
+int
+xfs_scrub_rtbitmap(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+
+out:
+ return error;
+}
+
+/* Scrub the realtime summary. */
+int
+xfs_scrub_rtsummary(
+ struct xfs_scrub_context *sc)
+{
+ /* XXX: implement this some day */
+ return -ENOENT;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
new file mode 100644
index 000000000000..9c42c4efd01e
--- /dev/null
+++ b/fs/xfs/scrub/scrub.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/scrub.h"
+#include "scrub/btree.h"
+
+/*
+ * Online Scrub and Repair
+ *
+ * Traditionally, XFS (the kernel driver) did not know how to check or
+ * repair on-disk data structures. That task was left to the xfs_check
+ * and xfs_repair tools, both of which require taking the filesystem
+ * offline for a thorough but time consuming examination. Online
+ * scrub & repair, on the other hand, enables us to check the metadata
+ * for obvious errors while carefully stepping around the filesystem's
+ * ongoing operations, locking rules, etc.
+ *
+ * Given that most XFS metadata consist of records stored in a btree,
+ * most of the checking functions iterate the btree blocks themselves
+ * looking for irregularities. When a record block is encountered, each
+ * record can be checked for obviously bad values. Record values can
+ * also be cross-referenced against other btrees to look for potential
+ * misunderstandings between pieces of metadata.
+ *
+ * It is expected that the checkers responsible for per-AG metadata
+ * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
+ * metadata structure, and perform any relevant cross-referencing before
+ * unlocking the AG and returning the results to userspace. These
+ * scrubbers must not keep an AG locked for too long to avoid tying up
+ * the block and inode allocators.
+ *
+ * Block maps and b-trees rooted in an inode present a special challenge
+ * because they can involve extents from any AG. The general scrubber
+ * structure of lock -> check -> xref -> unlock still holds, but AG
+ * locking order rules /must/ be obeyed to avoid deadlocks. The
+ * ordering rule, of course, is that we must lock in increasing AG
+ * order. Helper functions are provided to track which AG headers we've
+ * already locked. If we detect an imminent locking order violation, we
+ * can signal a potential deadlock, in which case the scrubber can jump
+ * out to the top level, lock all the AGs in order, and retry the scrub.
+ *
+ * For file data (directories, extended attributes, symlinks) scrub, we
+ * can simply lock the inode and walk the data. For btree data
+ * (directories and attributes) we follow the same btree-scrubbing
+ * strategy outlined previously to check the records.
+ *
+ * We use a bit of trickery with transactions to avoid buffer deadlocks
+ * if there is a cycle in the metadata. The basic problem is that
+ * travelling down a btree involves locking the current buffer at each
+ * tree level. If a pointer should somehow point back to a buffer that
+ * we've already examined, we will deadlock due to the second buffer
+ * locking attempt. Note however that grabbing a buffer in transaction
+ * context links the locked buffer to the transaction. If we try to
+ * re-grab the buffer in the context of the same transaction, we avoid
+ * the second lock attempt and continue. Between the verifier and the
+ * scrubber, something will notice that something is amiss and report
+ * the corruption. Therefore, each scrubber will allocate an empty
+ * transaction, attach buffers to it, and cancel the transaction at the
+ * end of the scrub run. Cancelling a non-dirty transaction simply
+ * unlocks the buffers.
+ *
+ * There are four pieces of data that scrub can communicate to
+ * userspace. The first is the error code (errno), which can be used to
+ * communicate operational errors in performing the scrub. There are
+ * also three flags that can be set in the scrub context. If the data
+ * structure itself is corrupt, the CORRUPT flag will be set. If
+ * the metadata is correct but otherwise suboptimal, the PREEN flag
+ * will be set.
+ */
+
+/*
+ * Scrub probe -- userspace uses this to probe if we're willing to scrub
+ * or repair a given mountpoint. This will be used by xfs_scrub to
+ * probe the kernel's abilities to scrub (and repair) the metadata. We
+ * do this by validating the ioctl inputs from userspace, preparing the
+ * filesystem for a scrub (or a repair) operation, and immediately
+ * returning to userspace. Userspace can use the returned errno and
+ * structure state to decide (in broad terms) if scrub/repair are
+ * supported by the running kernel.
+ */
+static int
+xfs_scrub_probe(
+ struct xfs_scrub_context *sc)
+{
+ int error = 0;
+
+ if (sc->sm->sm_ino || sc->sm->sm_agno)
+ return -EINVAL;
+ if (xfs_scrub_should_terminate(sc, &error))
+ return error;
+
+ return 0;
+}
+
+/* Scrub setup and teardown */
+
+/* Free all the resources and finish the transactions. */
+STATIC int
+xfs_scrub_teardown(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip_in,
+ int error)
+{
+ xfs_scrub_ag_free(sc, &sc->sa);
+ if (sc->tp) {
+ xfs_trans_cancel(sc->tp);
+ sc->tp = NULL;
+ }
+ if (sc->ip) {
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ if (sc->ip != ip_in &&
+ !xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ iput(VFS_I(sc->ip));
+ sc->ip = NULL;
+ }
+ if (sc->buf) {
+ kmem_free(sc->buf);
+ sc->buf = NULL;
+ }
+ return error;
+}
+
+/* Scrubbing dispatch. */
+
+static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
+ { /* ioctl presence test */
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_probe,
+ },
+ { /* superblock */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_superblock,
+ },
+ { /* agf */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_agf,
+ },
+ { /* agfl */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_agfl,
+ },
+ { /* agi */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_agi,
+ },
+ { /* bnobt */
+ .setup = xfs_scrub_setup_ag_allocbt,
+ .scrub = xfs_scrub_bnobt,
+ },
+ { /* cntbt */
+ .setup = xfs_scrub_setup_ag_allocbt,
+ .scrub = xfs_scrub_cntbt,
+ },
+ { /* inobt */
+ .setup = xfs_scrub_setup_ag_iallocbt,
+ .scrub = xfs_scrub_inobt,
+ },
+ { /* finobt */
+ .setup = xfs_scrub_setup_ag_iallocbt,
+ .scrub = xfs_scrub_finobt,
+ .has = xfs_sb_version_hasfinobt,
+ },
+ { /* rmapbt */
+ .setup = xfs_scrub_setup_ag_rmapbt,
+ .scrub = xfs_scrub_rmapbt,
+ .has = xfs_sb_version_hasrmapbt,
+ },
+ { /* refcountbt */
+ .setup = xfs_scrub_setup_ag_refcountbt,
+ .scrub = xfs_scrub_refcountbt,
+ .has = xfs_sb_version_hasreflink,
+ },
+ { /* inode record */
+ .setup = xfs_scrub_setup_inode,
+ .scrub = xfs_scrub_inode,
+ },
+ { /* inode data fork */
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_data,
+ },
+ { /* inode attr fork */
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_attr,
+ },
+ { /* inode CoW fork */
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_cow,
+ },
+ { /* directory */
+ .setup = xfs_scrub_setup_directory,
+ .scrub = xfs_scrub_directory,
+ },
+ { /* extended attributes */
+ .setup = xfs_scrub_setup_xattr,
+ .scrub = xfs_scrub_xattr,
+ },
+ { /* symbolic link */
+ .setup = xfs_scrub_setup_symlink,
+ .scrub = xfs_scrub_symlink,
+ },
+ { /* parent pointers */
+ .setup = xfs_scrub_setup_parent,
+ .scrub = xfs_scrub_parent,
+ },
+ { /* realtime bitmap */
+ .setup = xfs_scrub_setup_rt,
+ .scrub = xfs_scrub_rtbitmap,
+ .has = xfs_sb_version_hasrealtime,
+ },
+ { /* realtime summary */
+ .setup = xfs_scrub_setup_rt,
+ .scrub = xfs_scrub_rtsummary,
+ .has = xfs_sb_version_hasrealtime,
+ },
+ { /* user quota */
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+ { /* group quota */
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+ { /* project quota */
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+};
+
+/* This isn't a stable feature, warn once per day. */
+static inline void
+xfs_scrub_experimental_warning(
+ struct xfs_mount *mp)
+{
+ static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
+ "xfs_scrub_warning", 86400 * HZ, 1);
+ ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
+
+ if (__ratelimit(&scrub_warning))
+ xfs_alert(mp,
+"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
+}
+
+/* Dispatch metadata scrubbing. */
+int
+xfs_scrub_metadata(
+ struct xfs_inode *ip,
+ struct xfs_scrub_metadata *sm)
+{
+ struct xfs_scrub_context sc;
+ struct xfs_mount *mp = ip->i_mount;
+ const struct xfs_scrub_meta_ops *ops;
+ bool try_harder = false;
+ int error = 0;
+
+ trace_xfs_scrub_start(ip, sm, error);
+
+ /* Forbidden if we are shut down or mounted norecovery. */
+ error = -ESHUTDOWN;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out;
+ error = -ENOTRECOVERABLE;
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ goto out;
+
+ /* Check our inputs. */
+ error = -EINVAL;
+ sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
+ goto out;
+ if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
+ goto out;
+
+ /* Do we know about this type of metadata? */
+ error = -ENOENT;
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
+ goto out;
+ ops = &meta_scrub_ops[sm->sm_type];
+ if (ops->scrub == NULL)
+ goto out;
+
+ /*
+ * We won't scrub any filesystem that doesn't have the ability
+ * to record unwritten extents. The option was made default in
+ * 2003, removed from mkfs in 2007, and cannot be disabled in
+ * v5, so if we find a filesystem without this flag it's either
+ * really old or totally unsupported. Avoid it either way.
+ * We also don't support v1-v3 filesystems, which aren't
+ * mountable.
+ */
+ error = -EOPNOTSUPP;
+ if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
+ goto out;
+
+ /* Does this fs even support this type of metadata? */
+ error = -ENOENT;
+ if (ops->has && !ops->has(&mp->m_sb))
+ goto out;
+
+ /* We don't know how to repair anything yet. */
+ error = -EOPNOTSUPP;
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ goto out;
+
+ xfs_scrub_experimental_warning(mp);
+
+retry_op:
+ /* Set up for the operation. */
+ memset(&sc, 0, sizeof(sc));
+ sc.mp = ip->i_mount;
+ sc.sm = sm;
+ sc.ops = ops;
+ sc.try_harder = try_harder;
+ sc.sa.agno = NULLAGNUMBER;
+ error = sc.ops->setup(&sc, ip);
+ if (error)
+ goto out_teardown;
+
+ /* Scrub for errors. */
+ error = sc.ops->scrub(&sc);
+ if (!try_harder && error == -EDEADLOCK) {
+ /*
+ * Scrubbers return -EDEADLOCK to mean 'try harder'.
+ * Tear down everything we hold, then set up again with
+ * preparation for worst-case scenarios.
+ */
+ error = xfs_scrub_teardown(&sc, ip, 0);
+ if (error)
+ goto out;
+ try_harder = true;
+ goto retry_op;
+ } else if (error)
+ goto out_teardown;
+
+ if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT))
+ xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
+
+out_teardown:
+ error = xfs_scrub_teardown(&sc, ip, error);
+out:
+ trace_xfs_scrub_done(ip, sm, error);
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
+ sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ error = 0;
+ }
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
new file mode 100644
index 000000000000..e9ec041cf713
--- /dev/null
+++ b/fs/xfs/scrub/scrub.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_SCRUB_H__
+#define __XFS_SCRUB_SCRUB_H__
+
+struct xfs_scrub_context;
+
+struct xfs_scrub_meta_ops {
+ /* Acquire whatever resources are needed for the operation. */
+ int (*setup)(struct xfs_scrub_context *,
+ struct xfs_inode *);
+
+ /* Examine metadata for errors. */
+ int (*scrub)(struct xfs_scrub_context *);
+
+ /* Decide if we even have this piece of metadata. */
+ bool (*has)(struct xfs_sb *);
+};
+
+/* Buffer pointers and btree cursors for an entire AG. */
+struct xfs_scrub_ag {
+ xfs_agnumber_t agno;
+
+ /* AG btree roots */
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_buf *agi_bp;
+
+ /* AG btrees */
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur;
+ struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
+};
+
+struct xfs_scrub_context {
+ /* General scrub state. */
+ struct xfs_mount *mp;
+ struct xfs_scrub_metadata *sm;
+ const struct xfs_scrub_meta_ops *ops;
+ struct xfs_trans *tp;
+ struct xfs_inode *ip;
+ void *buf;
+ uint ilock_flags;
+ bool try_harder;
+
+ /* State tracking for single-AG operations. */
+ struct xfs_scrub_ag sa;
+};
+
+/* Metadata scrubbers */
+int xfs_scrub_tester(struct xfs_scrub_context *sc);
+int xfs_scrub_superblock(struct xfs_scrub_context *sc);
+int xfs_scrub_agf(struct xfs_scrub_context *sc);
+int xfs_scrub_agfl(struct xfs_scrub_context *sc);
+int xfs_scrub_agi(struct xfs_scrub_context *sc);
+int xfs_scrub_bnobt(struct xfs_scrub_context *sc);
+int xfs_scrub_cntbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inobt(struct xfs_scrub_context *sc);
+int xfs_scrub_finobt(struct xfs_scrub_context *sc);
+int xfs_scrub_rmapbt(struct xfs_scrub_context *sc);
+int xfs_scrub_refcountbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inode(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_data(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_attr(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_cow(struct xfs_scrub_context *sc);
+int xfs_scrub_directory(struct xfs_scrub_context *sc);
+int xfs_scrub_xattr(struct xfs_scrub_context *sc);
+int xfs_scrub_symlink(struct xfs_scrub_context *sc);
+int xfs_scrub_parent(struct xfs_scrub_context *sc);
+#ifdef CONFIG_XFS_RT
+int xfs_scrub_rtbitmap(struct xfs_scrub_context *sc);
+int xfs_scrub_rtsummary(struct xfs_scrub_context *sc);
+#else
+static inline int
+xfs_scrub_rtbitmap(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xfs_scrub_rtsummary(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xfs_scrub_quota(struct xfs_scrub_context *sc);
+#else
+static inline int
+xfs_scrub_quota(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+#endif
+
+#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
new file mode 100644
index 000000000000..3aa3d60f7c16
--- /dev/null
+++ b/fs/xfs/scrub/symlink.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up to scrub a symbolic link. */
+int
+xfs_scrub_setup_symlink(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ /* Allocate the buffer without the inode lock held. */
+ sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Symbolic links. */
+
+int
+xfs_scrub_symlink(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ loff_t len;
+ int error = 0;
+
+ if (!S_ISLNK(VFS_I(ip)->i_mode))
+ return -ENOENT;
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = ip->i_d.di_size;
+
+ /* Plausible size? */
+ if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Inline symlink? */
+ if (ifp->if_flags & XFS_IFINLINE) {
+ if (len > XFS_IFORK_DSIZE(ip) ||
+ len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip)))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Remote symlink; must read the contents. */
+ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
new file mode 100644
index 000000000000..472080e75788
--- /dev/null
+++ b/fs/xfs/scrub/trace.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Figure out which block the btree cursor was pointing to. */
+static inline xfs_fsblock_t
+xfs_scrub_btree_cur_fsbno(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level < cur->bc_nlevels && cur->bc_bufs[level])
+ return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn);
+ else if (level == cur->bc_nlevels - 1 &&
+ cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino);
+ else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+ return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0);
+ return NULLFSBLOCK;
+}
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
new file mode 100644
index 000000000000..c4ebfb5c1ee8
--- /dev/null
+++ b/fs/xfs/scrub/trace.h
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs_scrub
+
+#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_SCRUB_TRACE_H
+
+#include <linux/tracepoint.h>
+#include "xfs_bit.h"
+
+DECLARE_EVENT_CLASS(xfs_scrub_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
+ int error),
+ TP_ARGS(ip, sm, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->type = sm->sm_type;
+ __entry->agno = sm->sm_agno;
+ __entry->inum = sm->sm_ino;
+ __entry->gen = sm->sm_gen;
+ __entry->flags = sm->sm_flags;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->type,
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->error)
+)
+#define DEFINE_SCRUB_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \
+ int error), \
+ TP_ARGS(ip, sm, error))
+
+DEFINE_SCRUB_EVENT(xfs_scrub_start);
+DEFINE_SCRUB_EVENT(xfs_scrub_done);
+DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry);
+
+TRACE_EVENT(xfs_scrub_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int error, void *ret_ip),
+ TP_ARGS(sc, agno, bno, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_file_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, int error, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->offset,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_scrub_block_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, void *ret_ip),
+ TP_ARGS(sc, daddr, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+
+ fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+)
+
+#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_block_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, \
+ void *ret_ip), \
+ TP_ARGS(sc, daddr, ret_ip))
+
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error);
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen);
+
+DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr,
+ void *ret_ip),
+ TP_ARGS(sc, ino, daddr, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+
+ if (daddr) {
+ fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ } else {
+ agno = XFS_INO_TO_AGNO(sc->mp, ino);
+ bno = XFS_AGINO_TO_AGBNO(sc->mp,
+ XFS_INO_TO_AGINO(sc->mp, ino));
+ }
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu type %u agno %u agbno %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+)
+
+#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_ino_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \
+ xfs_daddr_t daddr, void *ret_ip), \
+ TP_ARGS(sc, ino, daddr, ret_ip))
+
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error);
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen);
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_warning);
+
+DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->offset,
+ __entry->ret_ip)
+);
+
+#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_fblock_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork, \
+ xfs_fileoff_t offset, void *ret_ip), \
+ TP_ARGS(sc, whichfork, offset, ret_ip))
+
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_error);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_warning);
+
+TRACE_EVENT(xfs_scrub_incomplete,
+ TP_PROTO(struct xfs_scrub_context *sc, void *ret_ip),
+ TP_ARGS(sc, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_btree_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr);
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_ifork_btree_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(int, ptr)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_btree_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, void *ret_ip),
+ TP_ARGS(sc, cur, level, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr);
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_ifork_btree_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, void *ret_ip),
+ TP_ARGS(sc, cur, level, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr);
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_scrub_sbtree_class,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level),
+ TP_ARGS(sc, cur, level),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, level)
+ __field(int, nlevels)
+ __field(int, ptr)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->ptr = cur->bc_ptrs[level];
+ ),
+ TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->btnum,
+ __entry->agno,
+ __entry->bno,
+ __entry->level,
+ __entry->nlevels,
+ __entry->ptr)
+)
+#define DEFINE_SCRUB_SBTREE_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_sbtree_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, \
+ int level), \
+ TP_ARGS(sc, cur, level))
+
+DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec);
+DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key);
+
+#endif /* _TRACE_XFS_SCRUB_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE scrub/trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
new file mode 100644
index 000000000000..e00e0eadac6a
--- /dev/null
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_H__
+#define __XFS_SCRUB_H__
+
+#ifndef CONFIG_XFS_ONLINE_SCRUB
+# define xfs_scrub_metadata(ip, sm) (-ENOTTY)
+#else
+int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm);
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
+
+#endif /* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 80cd0fd86783..5ff7f228d616 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -19,7 +19,6 @@
#define __XFS_H__
#ifdef CONFIG_XFS_DEBUG
-#define STATIC
#define DEBUG 1
#define XFS_BUF_LOCK_TRACKING 1
#endif
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7034e17535de..3354140de07e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode)
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
+ umode_t mode;
+ bool set_mode = false;
int error = 0;
if (!acl)
@@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
if (type == ACL_TYPE_ACCESS) {
- umode_t mode;
-
error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
- error = xfs_set_mode(inode, mode);
- if (error)
- return error;
+ set_mode = true;
}
set_acl:
- return __xfs_set_acl(inode, acl, type);
+ error = __xfs_set_acl(inode, acl, type);
+ if (error)
+ return error;
+
+ /*
+ * We set the mode after successfully updating the ACL xattr because the
+ * xattr update can fail at ENOSPC and we don't want to change the mode
+ * if the ACL update hasn't been applied.
+ */
+ if (set_mode)
+ error = xfs_set_mode(inode, mode);
+
+ return error;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 29172609f2a3..a3eeaba156c5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -343,7 +343,8 @@ xfs_end_io(
error = xfs_reflink_end_cow(ip, offset, size);
break;
case XFS_IO_UNWRITTEN:
- error = xfs_iomap_write_unwritten(ip, offset, size);
+ /* writeback should never update isize */
+ error = xfs_iomap_write_unwritten(ip, offset, size, false);
break;
default:
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
@@ -445,6 +446,19 @@ xfs_imap_valid(
{
offset >>= inode->i_blkbits;
+ /*
+ * We have to make sure the cached mapping is within EOF to protect
+ * against eofblocks trimming on file release leaving us with a stale
+ * mapping. Otherwise, a page for a subsequent file extending buffered
+ * write could get picked up by this writeback cycle and written to the
+ * wrong blocks.
+ *
+ * Note that what we really want here is a generic mapping invalidation
+ * mechanism to protect us from arbitrary extent modifying contexts, not
+ * just eofblocks.
+ */
+ xfs_trim_extent_eof(imap, XFS_I(inode));
+
return offset >= imap->br_startoff &&
offset < imap->br_startoff + imap->br_blockcount;
}
@@ -734,6 +748,14 @@ xfs_vm_invalidatepage(
{
trace_xfs_invalidatepage(page->mapping->host, page, offset,
length);
+
+ /*
+ * If we are invalidating the entire page, clear the dirty state from it
+ * so that we can check for attempts to release dirty cached pages in
+ * xfs_vm_releasepage().
+ */
+ if (offset == 0 && length >= PAGE_SIZE)
+ cancel_dirty_page(page);
block_invalidatepage(page, offset, length);
}
@@ -1189,25 +1211,27 @@ xfs_vm_releasepage(
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
* ->releasepage() via shrink_active_list(). Conversely,
- * block_invalidatepage() can send pages that are still marked dirty
- * but otherwise have invalidated buffers.
+ * block_invalidatepage() can send pages that are still marked dirty but
+ * otherwise have invalidated buffers.
*
* We want to release the latter to avoid unnecessary buildup of the
- * LRU, skip the former and warn if we've left any lingering
- * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
- * or unwritten buffers and warn if the page is not dirty. Otherwise
- * try to release the buffers.
+ * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
+ * that are entirely invalidated and need to be released. Hence the
+ * only time we should get dirty pages here is through
+ * shrink_active_list() and so we can simply skip those now.
+ *
+ * warn if we've left any lingering delalloc/unwritten buffers on clean
+ * or invalidated pages we are about to release.
*/
+ if (PageDirty(page))
+ return 0;
+
xfs_count_page_state(page, &delalloc, &unwritten);
- if (delalloc) {
- WARN_ON_ONCE(!PageDirty(page));
+ if (WARN_ON_ONCE(delalloc))
return 0;
- }
- if (unwritten) {
- WARN_ON_ONCE(!PageDirty(page));
+ if (WARN_ON_ONCE(unwritten))
return 0;
- }
return try_to_free_buffers(page);
}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 5d5a5e277f35..d07bf27451c9 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -48,6 +48,8 @@ struct xfs_attr_list_context;
#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
+#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
+
#define XFS_ATTR_FLAGS \
{ ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
{ ATTR_ROOT, "ROOT" }, \
@@ -56,7 +58,8 @@ struct xfs_attr_list_context;
{ ATTR_CREATE, "CREATE" }, \
{ ATTR_REPLACE, "REPLACE" }, \
{ ATTR_KERNOTIME, "KERNOTIME" }, \
- { ATTR_KERNOVAL, "KERNOVAL" }
+ { ATTR_KERNOVAL, "KERNOVAL" }, \
+ { ATTR_INCOMPLETE, "INCOMPLETE" }
/*
* The maximum size (into the kernel or returned from the kernel) of an
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index ebd66b19fbfc..52818ea2eb50 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -251,47 +251,44 @@ xfs_attr3_node_inactive(
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
- XFS_ATTR_FORK);
+ error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp,
+ XFS_ATTR_FORK);
if (error)
return error;
- if (child_bp) {
- /* save for re-read later */
- child_blkno = XFS_BUF_ADDR(child_bp);
- /*
- * Invalidate the subtree, however we have to.
- */
- info = child_bp->b_addr;
- switch (info->magic) {
- case cpu_to_be16(XFS_DA_NODE_MAGIC):
- case cpu_to_be16(XFS_DA3_NODE_MAGIC):
- error = xfs_attr3_node_inactive(trans, dp,
- child_bp, level + 1);
- break;
- case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
- case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
- error = xfs_attr3_leaf_inactive(trans, dp,
- child_bp);
- break;
- default:
- error = -EIO;
- xfs_trans_brelse(*trans, child_bp);
- break;
- }
- if (error)
- return error;
+ /* save for re-read later */
+ child_blkno = XFS_BUF_ADDR(child_bp);
- /*
- * Remove the subsidiary block from the cache
- * and from the log.
- */
- error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
- &child_bp, XFS_ATTR_FORK);
- if (error)
- return error;
- xfs_trans_binval(*trans, child_bp);
+ /*
+ * Invalidate the subtree, however we have to.
+ */
+ info = child_bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, child_bp,
+ level + 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
+ break;
+ default:
+ error = -EIO;
+ xfs_trans_brelse(*trans, child_bp);
+ break;
}
+ if (error)
+ return error;
+
+ /*
+ * Remove the subsidiary block from the cache and from the log.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, child_bp);
/*
* If we're not done, re-read the parent to get the next
@@ -302,6 +299,8 @@ xfs_attr3_node_inactive(
&bp, XFS_ATTR_FORK);
if (error)
return error;
+ node = bp->b_addr;
+ btree = dp->d_ops->node_tree_p(node);
child_fsb = be32_to_cpu(btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 7740c8a5e736..3e59a348ea71 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -204,19 +204,103 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
return 0;
}
+/*
+ * We didn't find the block & hash mentioned in the cursor state, so
+ * walk down the attr btree looking for the hash.
+ */
STATIC int
-xfs_attr_node_list(xfs_attr_list_context_t *context)
+xfs_attr_node_list_lookup(
+ struct xfs_attr_list_context *context,
+ struct attrlist_cursor_kern *cursor,
+ struct xfs_buf **pbp)
{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- struct xfs_attr3_icleaf_hdr leafhdr;
- struct xfs_da3_icnode_hdr nodehdr;
- struct xfs_da_node_entry *btree;
- int error, i;
- struct xfs_buf *bp;
- struct xfs_inode *dp = context->dp;
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = context->tp;
+ struct xfs_buf *bp;
+ int i;
+ int error = 0;
+ unsigned int expected_level = 0;
+ uint16_t magic;
+
+ ASSERT(*pbp == NULL);
+ cursor->blkno = 0;
+ for (;;) {
+ error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ node);
+ goto out_corruptbuf;
+ }
+
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ goto out_corruptbuf;
+
+ /* Check the level from the root node. */
+ if (cursor->blkno == 0)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ goto out_corruptbuf;
+ else
+ expected_level--;
+
+ btree = dp->d_ops->node_tree_p(node);
+ for (i = 0; i < nodehdr.count; btree++, i++) {
+ if (cursor->hashval <= be32_to_cpu(btree->hashval)) {
+ cursor->blkno = be32_to_cpu(btree->before);
+ trace_xfs_attr_list_node_descend(context,
+ btree);
+ break;
+ }
+ }
+ xfs_trans_brelse(tp, bp);
+
+ if (i == nodehdr.count)
+ return 0;
+
+ /* We can't point back to the root. */
+ if (cursor->blkno == 0)
+ return -EFSCORRUPTED;
+ }
+
+ if (expected_level != 0)
+ goto out_corruptbuf;
+
+ *pbp = bp;
+ return 0;
+
+out_corruptbuf:
+ xfs_trans_brelse(tp, bp);
+ return -EFSCORRUPTED;
+}
+
+STATIC int
+xfs_attr_node_list(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct attrlist_cursor_kern *cursor;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_intnode *node;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
trace_xfs_attr_node_list(context);
@@ -277,47 +361,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
* Note that start of node block is same as start of leaf block.
*/
if (bp == NULL) {
- cursor->blkno = 0;
- for (;;) {
- uint16_t magic;
-
- error = xfs_da3_node_read(context->tp, dp,
- cursor->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error)
- return error;
- node = bp->b_addr;
- magic = be16_to_cpu(node->hdr.info.magic);
- if (magic == XFS_ATTR_LEAF_MAGIC ||
- magic == XFS_ATTR3_LEAF_MAGIC)
- break;
- if (magic != XFS_DA_NODE_MAGIC &&
- magic != XFS_DA3_NODE_MAGIC) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount,
- node);
- xfs_trans_brelse(context->tp, bp);
- return -EFSCORRUPTED;
- }
-
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
- for (i = 0; i < nodehdr.count; btree++, i++) {
- if (cursor->hashval
- <= be32_to_cpu(btree->hashval)) {
- cursor->blkno = be32_to_cpu(btree->before);
- trace_xfs_attr_list_node_descend(context,
- btree);
- break;
- }
- }
- if (i == nodehdr.count) {
- xfs_trans_brelse(context->tp, bp);
- return 0;
- }
- xfs_trans_brelse(context->tp, bp);
- }
+ error = xfs_attr_node_list_lookup(context, cursor, &bp);
+ if (error || !bp)
+ return error;
}
ASSERT(bp != NULL);
@@ -407,7 +453,8 @@ xfs_attr3_leaf_list_int(
cursor->offset = 0;
}
- if (entry->flags & XFS_ATTR_INCOMPLETE)
+ if ((entry->flags & XFS_ATTR_INCOMPLETE) &&
+ !(context->flags & ATTR_INCOMPLETE))
continue; /* skip incomplete entries */
if (entry->flags & XFS_ATTR_LOCAL) {
@@ -499,8 +546,8 @@ xfs_attr_list_int(
#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
(((struct attrlist_ent *) 0)->a_name - (char *) 0)
#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
- & ~(sizeof(u_int32_t)-1))
+ ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \
+ & ~(sizeof(uint32_t)-1))
/*
* Format an attribute and copy it out to the user's buffer.
@@ -583,6 +630,10 @@ xfs_attr_list(
(cursor->hashval || cursor->blkno || cursor->offset))
return -EINVAL;
+ /* Only internal consumers can retrieve incomplete attrs. */
+ if (flags & ATTR_INCOMPLETE)
+ return -EINVAL;
+
/*
* Check for a properly aligned buffer.
*/
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index cd9a5400ba4f..6d37ab43195f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -84,6 +84,7 @@ xfs_zero_extent(
GFP_NOFS, 0);
}
+#ifdef CONFIG_XFS_RT
int
xfs_bmap_rtalloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
@@ -190,6 +191,7 @@ xfs_bmap_rtalloc(
}
return 0;
}
+#endif /* CONFIG_XFS_RT */
/*
* Check if the endoff is outside the last extent. If so the caller will grow
@@ -227,15 +229,17 @@ xfs_bmap_count_leaves(
struct xfs_ifork *ifp,
xfs_filblks_t *count)
{
+ struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got;
- xfs_extnum_t numrecs = 0, i = 0;
+ xfs_extnum_t numrecs = 0;
- while (xfs_iext_get_extent(ifp, i++, &got)) {
+ for_each_xfs_iext(ifp, &icur, &got) {
if (!isnullstartblock(got.br_startblock)) {
*count += got.br_blockcount;
numrecs++;
}
}
+
return numrecs;
}
@@ -403,125 +407,103 @@ xfs_bmap_count_blocks(
return 0;
}
-/*
- * returns 1 for success, 0 if we failed to map the extent.
- */
-STATIC int
-xfs_getbmapx_fix_eof_hole(
- xfs_inode_t *ip, /* xfs incore inode pointer */
- int whichfork,
- struct getbmapx *out, /* output structure */
- int prealloced, /* this is a file with
- * preallocated data space */
- int64_t end, /* last block requested */
- xfs_fsblock_t startblock,
- bool moretocome)
+static int
+xfs_getbmap_report_one(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv,
+ struct kgetbmap *out,
+ int64_t bmv_end,
+ struct xfs_bmbt_irec *got)
{
- int64_t fixlen;
- xfs_mount_t *mp; /* file system mount point */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent pointer */
- xfs_fileoff_t fileblock;
-
- if (startblock == HOLESTARTBLOCK) {
- mp = ip->i_mount;
- out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
- fixlen -= out->bmv_offset;
- if (prealloced && out->bmv_offset + out->bmv_length == end) {
- /* Came to hole at EOF. Trim it. */
- if (fixlen <= 0)
- return 0;
- out->bmv_length = fixlen;
- }
+ struct kgetbmap *p = out + bmv->bmv_entries;
+ bool shared = false, trimmed = false;
+ int error;
+
+ error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+ if (error)
+ return error;
+
+ if (isnullstartblock(got->br_startblock) ||
+ got->br_startblock == DELAYSTARTBLOCK) {
+ /*
+ * Delalloc extents that start beyond EOF can occur due to
+ * speculative EOF allocation when the delalloc extent is larger
+ * than the largest freespace extent at conversion time. These
+ * extents cannot be converted by data writeback, so can exist
+ * here even if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
+ ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
+
+ p->bmv_oflags |= BMV_OF_DELALLOC;
+ p->bmv_block = -2;
} else {
- if (startblock == DELAYSTARTBLOCK)
- out->bmv_block = -2;
- else
- out->bmv_block = xfs_fsb_to_db(ip, startblock);
- fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!moretocome &&
- xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
- (lastx == xfs_iext_count(ifp) - 1))
- out->bmv_oflags |= BMV_OF_LAST;
+ p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
}
- return 1;
+ if (got->br_state == XFS_EXT_UNWRITTEN &&
+ (bmv->bmv_iflags & BMV_IF_PREALLOC))
+ p->bmv_oflags |= BMV_OF_PREALLOC;
+
+ if (shared)
+ p->bmv_oflags |= BMV_OF_SHARED;
+
+ p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
+ p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
+
+ bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+ bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+ bmv->bmv_entries++;
+ return 0;
}
-/* Adjust the reported bmap around shared/unshared extent transitions. */
-STATIC int
-xfs_getbmap_adjust_shared(
- struct xfs_inode *ip,
- int whichfork,
- struct xfs_bmbt_irec *map,
- struct getbmapx *out,
- struct xfs_bmbt_irec *next_map)
+static void
+xfs_getbmap_report_hole(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv,
+ struct kgetbmap *out,
+ int64_t bmv_end,
+ xfs_fileoff_t bno,
+ xfs_fileoff_t end)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_agblock_t ebno;
- xfs_extlen_t elen;
- xfs_extlen_t nlen;
- int error;
+ struct kgetbmap *p = out + bmv->bmv_entries;
- next_map->br_startblock = NULLFSBLOCK;
- next_map->br_startoff = NULLFILEOFF;
- next_map->br_blockcount = 0;
+ if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
+ return;
- /* Only written data blocks can be shared. */
- if (!xfs_is_reflink_inode(ip) ||
- whichfork != XFS_DATA_FORK ||
- !xfs_bmap_is_real_extent(map))
- return 0;
+ p->bmv_block = -1;
+ p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
+ p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
- agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
- error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
- map->br_blockcount, &ebno, &elen, true);
- if (error)
- return error;
+ bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+ bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+ bmv->bmv_entries++;
+}
- if (ebno == NULLAGBLOCK) {
- /* No shared blocks at all. */
- return 0;
- } else if (agbno == ebno) {
- /*
- * Shared extent at (agbno, elen). Shrink the reported
- * extent length and prepare to move the start of map[i]
- * to agbno+elen, with the aim of (re)formatting the new
- * map[i] the next time through the inner loop.
- */
- out->bmv_length = XFS_FSB_TO_BB(mp, elen);
- out->bmv_oflags |= BMV_OF_SHARED;
- if (elen != map->br_blockcount) {
- *next_map = *map;
- next_map->br_startblock += elen;
- next_map->br_startoff += elen;
- next_map->br_blockcount -= elen;
- }
- map->br_blockcount -= elen;
- } else {
- /*
- * There's an unshared extent (agbno, ebno - agbno)
- * followed by shared extent at (ebno, elen). Shrink
- * the reported extent length to cover only the unshared
- * extent and prepare to move up the start of map[i] to
- * ebno, with the aim of (re)formatting the new map[i]
- * the next time through the inner loop.
- */
- *next_map = *map;
- nlen = ebno - agbno;
- out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
- next_map->br_startblock += nlen;
- next_map->br_startoff += nlen;
- next_map->br_blockcount -= nlen;
- map->br_blockcount -= nlen;
- }
+static inline bool
+xfs_getbmap_full(
+ struct getbmapx *bmv)
+{
+ return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
+}
- return 0;
+static bool
+xfs_getbmap_next_rec(
+ struct xfs_bmbt_irec *rec,
+ xfs_fileoff_t total_end)
+{
+ xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount;
+
+ if (end == total_end)
+ return false;
+
+ rec->br_startoff += rec->br_blockcount;
+ if (!isnullstartblock(rec->br_startblock) &&
+ rec->br_startblock != DELAYSTARTBLOCK)
+ rec->br_startblock += rec->br_blockcount;
+ rec->br_blockcount = total_end - end;
+ return true;
}
/*
@@ -533,33 +515,22 @@ xfs_getbmap_adjust_shared(
*/
int /* error code */
xfs_getbmap(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
struct getbmapx *bmv, /* user bmap structure */
- xfs_bmap_format_t formatter, /* format to user */
- void *arg) /* formatter arg */
+ struct kgetbmap *out)
{
- int64_t bmvend; /* last block requested */
- int error = 0; /* return value */
- int64_t fixlen; /* length for -1 case */
- int i; /* extent number */
- int lock; /* lock state */
- xfs_bmbt_irec_t *map; /* buffer for user's data */
- xfs_mount_t *mp; /* file system mount point */
- int nex; /* # of user extents can do */
- int subnex; /* # of bmapi's can do */
- int nmap; /* number of map entries */
- struct getbmapx *out; /* output structure */
- int whichfork; /* data or attr fork */
- int prealloced; /* this is a file with
- * preallocated data space */
- int iflags; /* interface flags */
- int bmapi_flags; /* flags for xfs_bmapi */
- int cur_ext = 0;
- struct xfs_bmbt_irec inject_map;
-
- mp = ip->i_mount;
- iflags = bmv->bmv_iflags;
-
+ struct xfs_mount *mp = ip->i_mount;
+ int iflags = bmv->bmv_iflags;
+ int whichfork, lock, error = 0;
+ int64_t bmv_end, max_len;
+ xfs_fileoff_t bno, first_bno;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec got, rec;
+ xfs_filblks_t len;
+ struct xfs_iext_cursor icur;
+
+ if (bmv->bmv_iflags & ~BMV_IF_VALID)
+ return -EINVAL;
#ifndef DEBUG
/* Only allow CoW fork queries if we're debugging. */
if (iflags & BMV_IF_COWFORK)
@@ -568,89 +539,42 @@ xfs_getbmap(
if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
return -EINVAL;
+ if (bmv->bmv_length < -1)
+ return -EINVAL;
+ bmv->bmv_entries = 0;
+ if (bmv->bmv_length == 0)
+ return 0;
+
if (iflags & BMV_IF_ATTRFORK)
whichfork = XFS_ATTR_FORK;
else if (iflags & BMV_IF_COWFORK)
whichfork = XFS_COW_FORK;
else
whichfork = XFS_DATA_FORK;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
switch (whichfork) {
case XFS_ATTR_FORK:
- if (XFS_IFORK_Q(ip)) {
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return -EINVAL;
- } else if (unlikely(
- ip->i_d.di_aformat != 0 &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
- XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
+ if (!XFS_IFORK_Q(ip))
+ goto out_unlock_iolock;
- prealloced = 0;
- fixlen = 1LL << 32;
+ max_len = 1LL << 32;
+ lock = xfs_ilock_attr_map_shared(ip);
break;
case XFS_COW_FORK:
- if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
- return -EINVAL;
+ /* No CoW fork? Just return */
+ if (!ifp)
+ goto out_unlock_iolock;
- if (xfs_get_cowextsz_hint(ip)) {
- prealloced = 1;
- fixlen = mp->m_super->s_maxbytes;
- } else {
- prealloced = 0;
- fixlen = XFS_ISIZE(ip);
- }
- break;
- default:
- /* Local format data forks report no extents. */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- bmv->bmv_entries = 0;
- return 0;
- }
- if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
- return -EINVAL;
+ if (xfs_get_cowextsz_hint(ip))
+ max_len = mp->m_super->s_maxbytes;
+ else
+ max_len = XFS_ISIZE(ip);
- if (xfs_get_extsz_hint(ip) ||
- ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
- prealloced = 1;
- fixlen = mp->m_super->s_maxbytes;
- } else {
- prealloced = 0;
- fixlen = XFS_ISIZE(ip);
- }
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
break;
- }
-
- if (bmv->bmv_length == -1) {
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
- bmv->bmv_length =
- max_t(int64_t, fixlen - bmv->bmv_offset, 0);
- } else if (bmv->bmv_length == 0) {
- bmv->bmv_entries = 0;
- return 0;
- } else if (bmv->bmv_length < 0) {
- return -EINVAL;
- }
-
- nex = bmv->bmv_count - 1;
- if (nex <= 0)
- return -EINVAL;
- bmvend = bmv->bmv_offset + bmv->bmv_length;
-
-
- if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
- return -ENOMEM;
- out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
- if (!out)
- return -ENOMEM;
-
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- switch (whichfork) {
case XFS_DATA_FORK:
if (!(iflags & BMV_IF_DELALLOC) &&
(ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
@@ -668,154 +592,105 @@ xfs_getbmap(
*/
}
+ if (xfs_get_extsz_hint(ip) ||
+ (ip->i_d.di_flags &
+ (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
+ max_len = mp->m_super->s_maxbytes;
+ else
+ max_len = XFS_ISIZE(ip);
+
lock = xfs_ilock_data_map_shared(ip);
break;
- case XFS_COW_FORK:
- lock = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lock);
- break;
- case XFS_ATTR_FORK:
- lock = xfs_ilock_attr_map_shared(ip);
- break;
}
- /*
- * Don't let nex be bigger than the number of extents
- * we can have assuming alternating holes and real extents.
- */
- if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
- nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-
- bmapi_flags = xfs_bmapi_aflag(whichfork);
- if (!(iflags & BMV_IF_PREALLOC))
- bmapi_flags |= XFS_BMAPI_IGSTATE;
-
- /*
- * Allocate enough space to handle "subnex" maps at a time.
- */
- error = -ENOMEM;
- subnex = 16;
- map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
- if (!map)
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /* Local format inode forks report no extents. */
goto out_unlock_ilock;
+ default:
+ error = -EINVAL;
+ goto out_unlock_ilock;
+ }
- bmv->bmv_entries = 0;
-
- if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
- (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
- error = 0;
- goto out_free_map;
+ if (bmv->bmv_length == -1) {
+ max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
+ bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
}
- do {
- nmap = (nex> subnex) ? subnex : nex;
- error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
- XFS_BB_TO_FSB(mp, bmv->bmv_length),
- map, &nmap, bmapi_flags);
- if (error)
- goto out_free_map;
- ASSERT(nmap <= subnex);
-
- for (i = 0; i < nmap && bmv->bmv_length &&
- cur_ext < bmv->bmv_count - 1; i++) {
- out[cur_ext].bmv_oflags = 0;
- if (map[i].br_state == XFS_EXT_UNWRITTEN)
- out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
- else if (map[i].br_startblock == DELAYSTARTBLOCK)
- out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
- out[cur_ext].bmv_offset =
- XFS_FSB_TO_BB(mp, map[i].br_startoff);
- out[cur_ext].bmv_length =
- XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- out[cur_ext].bmv_unused1 = 0;
- out[cur_ext].bmv_unused2 = 0;
+ bmv_end = bmv->bmv_offset + bmv->bmv_length;
- /*
- * delayed allocation extents that start beyond EOF can
- * occur due to speculative EOF allocation when the
- * delalloc extent is larger than the largest freespace
- * extent at conversion time. These extents cannot be
- * converted by data writeback, so can exist here even
- * if we are not supposed to be finding delalloc
- * extents.
- */
- if (map[i].br_startblock == DELAYSTARTBLOCK &&
- map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
- ASSERT((iflags & BMV_IF_DELALLOC) != 0);
-
- if (map[i].br_startblock == HOLESTARTBLOCK &&
- whichfork == XFS_ATTR_FORK) {
- /* came to the end of attribute fork */
- out[cur_ext].bmv_oflags |= BMV_OF_LAST;
- goto out_free_map;
- }
+ first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
+ len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
- /* Is this a shared block? */
- error = xfs_getbmap_adjust_shared(ip, whichfork,
- &map[i], &out[cur_ext], &inject_map);
- if (error)
- goto out_free_map;
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ goto out_unlock_ilock;
+ }
- if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
- &out[cur_ext], prealloced, bmvend,
- map[i].br_startblock,
- inject_map.br_startblock != NULLFSBLOCK))
- goto out_free_map;
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
+ /*
+ * Report a whole-file hole if the delalloc flag is set to
+ * stay compatible with the old implementation.
+ */
+ if (iflags & BMV_IF_DELALLOC)
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
+ XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+ goto out_unlock_ilock;
+ }
- bmv->bmv_offset =
- out[cur_ext].bmv_offset +
- out[cur_ext].bmv_length;
- bmv->bmv_length =
- max_t(int64_t, 0, bmvend - bmv->bmv_offset);
+ while (!xfs_getbmap_full(bmv)) {
+ xfs_trim_extent(&got, first_bno, len);
- /*
- * In case we don't want to return the hole,
- * don't increase cur_ext so that we can reuse
- * it in the next loop.
- */
- if ((iflags & BMV_IF_NO_HOLES) &&
- map[i].br_startblock == HOLESTARTBLOCK) {
- memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
- continue;
- }
+ /*
+ * Report an entry for a hole if this extent doesn't directly
+ * follow the previous one.
+ */
+ if (got.br_startoff > bno) {
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
+ got.br_startoff);
+ if (xfs_getbmap_full(bmv))
+ break;
+ }
- /*
- * In order to report shared extents accurately,
- * we report each distinct shared/unshared part
- * of a single bmbt record using multiple bmap
- * extents. To make that happen, we iterate the
- * same map array item multiple times, each
- * time trimming out the subextent that we just
- * reported.
- *
- * Because of this, we must check the out array
- * index (cur_ext) directly against bmv_count-1
- * to avoid overflows.
- */
- if (inject_map.br_startblock != NULLFSBLOCK) {
- map[i] = inject_map;
- i--;
+ /*
+ * In order to report shared extents accurately, we report each
+ * distinct shared / unshared part of a single bmbt record with
+ * an individual getbmapx record.
+ */
+ bno = got.br_startoff + got.br_blockcount;
+ rec = got;
+ do {
+ error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
+ &rec);
+ if (error || xfs_getbmap_full(bmv))
+ goto out_unlock_ilock;
+ } while (xfs_getbmap_next_rec(&rec, bno));
+
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
+
+ if (whichfork != XFS_ATTR_FORK && bno < end &&
+ !xfs_getbmap_full(bmv)) {
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
+ bno, end);
}
- bmv->bmv_entries++;
- cur_ext++;
+ break;
}
- } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
- out_free_map:
- kmem_free(map);
- out_unlock_ilock:
- xfs_iunlock(ip, lock);
- out_unlock_iolock:
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- for (i = 0; i < cur_ext; i++) {
- /* format results & advance arg */
- error = formatter(&arg, &out[i]);
- if (error)
+ if (bno >= first_bno + len)
break;
}
- kmem_free(out);
+out_unlock_ilock:
+ xfs_iunlock(ip, lock);
+out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return error;
}
@@ -1387,53 +1262,12 @@ out:
}
-/*
- * @next_fsb will keep track of the extent currently undergoing shift.
- * @stop_fsb will keep track of the extent at which we have to stop.
- * If we are shifting left, we will start with block (offset + len) and
- * shift each extent till last extent.
- * If we are shifting right, we will start with last extent inside file space
- * and continue until we reach the block corresponding to offset.
- */
static int
-xfs_shift_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len,
- enum shift_direction direction)
+xfs_prepare_shift(
+ struct xfs_inode *ip,
+ loff_t offset)
{
- int done = 0;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
int error;
- struct xfs_defer_ops dfops;
- xfs_fsblock_t first_block;
- xfs_fileoff_t stop_fsb;
- xfs_fileoff_t next_fsb;
- xfs_fileoff_t shift_fsb;
- uint resblks;
-
- ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
-
- if (direction == SHIFT_LEFT) {
- /*
- * Reserve blocks to cover potential extent merges after left
- * shift operations.
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- next_fsb = XFS_B_TO_FSB(mp, offset + len);
- stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
- } else {
- /*
- * If right shift, delegate the work of initialization of
- * next_fsb to xfs_bmap_shift_extent as it has ilock held.
- */
- resblks = 0;
- next_fsb = NULLFSBLOCK;
- stop_fsb = XFS_B_TO_FSB(mp, offset);
- }
-
- shift_fsb = XFS_B_TO_FSB(mp, len);
/*
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
@@ -1449,8 +1283,7 @@ xfs_shift_file_space(
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- offset, -1);
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
if (error)
return error;
error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
@@ -1459,16 +1292,62 @@ xfs_shift_file_space(
return error;
/*
- * The extent shiting code works on extent granularity. So, if
- * stop_fsb is not the starting block of extent, we need to split
- * the extent at stop_fsb.
+ * Clean out anything hanging around in the cow fork now that
+ * we've flushed all the dirty data out to disk to avoid having
+ * CoW extents at the wrong offsets.
*/
- if (direction == SHIFT_RIGHT) {
- error = xfs_bmap_split_extent(ip, stop_fsb);
+ if (xfs_is_reflink_inode(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
+ true);
if (error)
return error;
}
+ return 0;
+}
+
+/*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shift extent for the given file.
+ * The first thing we do is to free data blocks in the specified range
+ * by calling xfs_free_file_space(). It would also sync dirty data
+ * and invalidate page cache over the region on which collapse range
+ * is working. And Shift extent records to the left to cover a hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+ xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
+ uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ bool done = false;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+ trace_xfs_collapse_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
+
+ error = xfs_prepare_shift(ip, offset);
+ if (error)
+ return error;
+
while (!error && !done) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
&tp);
@@ -1481,25 +1360,17 @@ xfs_shift_file_space(
XFS_QMOPT_RES_REGBLKS);
if (error)
goto out_trans_cancel;
-
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_defer_init(&dfops, &first_block);
-
- /*
- * We are using the write transaction in which max 2 bmbt
- * updates are allowed
- */
- error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
- &done, stop_fsb, &first_block, &dfops,
- direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
+ error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
+ &done, stop_fsb, &first_block, &dfops);
if (error)
goto out_bmap_cancel;
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
-
error = xfs_trans_commit(tp);
}
@@ -1513,36 +1384,6 @@ out_trans_cancel:
}
/*
- * xfs_collapse_file_space()
- * This routine frees disk space and shift extent for the given file.
- * The first thing we do is to free data blocks in the specified range
- * by calling xfs_free_file_space(). It would also sync dirty data
- * and invalidate page cache over the region on which collapse range
- * is working. And Shift extent records to the left to cover a hole.
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-int
-xfs_collapse_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len)
-{
- int error;
-
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- trace_xfs_collapse_file_space(ip);
-
- error = xfs_free_file_space(ip, offset, len);
- if (error)
- return error;
-
- return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
-}
-
-/*
* xfs_insert_file_space()
* This routine create hole space by shifting extents for the given file.
* The first thing we do is to sync dirty data and invalidate page cache
@@ -1560,10 +1401,60 @@ xfs_insert_file_space(
loff_t offset,
loff_t len)
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset);
+ xfs_fileoff_t next_fsb = NULLFSBLOCK;
+ xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
+ bool done = false;
+
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
trace_xfs_insert_file_space(ip);
- return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
+ error = xfs_prepare_shift(ip, offset);
+ if (error)
+ return error;
+
+ /*
+ * The extent shifting code works on extent granularity. So, if stop_fsb
+ * is not the starting block of extent, we need to split the extent at
+ * stop_fsb.
+ */
+ error = xfs_bmap_split_extent(ip, stop_fsb);
+ if (error)
+ return error;
+
+ while (!error && !done) {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0,
+ &tp);
+ if (error)
+ break;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_defer_init(&dfops, &first_block);
+ error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
+ &done, stop_fsb, &first_block, &dfops);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_defer_finish(&tp, &dfops);
+ if (error)
+ goto out_bmap_cancel;
+ error = xfs_trans_commit(tp);
+ }
+
+ return error;
+
+out_bmap_cancel:
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ return error;
}
/*
@@ -1818,7 +1709,6 @@ xfs_swap_extent_forks(
xfs_filblks_t aforkblks = 0;
xfs_filblks_t taforkblks = 0;
xfs_extnum_t junk;
- xfs_extnum_t nextents;
uint64_t tmp;
int error;
@@ -1893,13 +1783,6 @@ xfs_swap_extent_forks(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /*
- * If the extents fit in the inode, fix the pointer. Otherwise
- * it's already NULL or pointing to the extent.
- */
- nextents = xfs_iext_count(&ip->i_df);
- if (nextents <= XFS_INLINE_EXTS)
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -1911,13 +1794,6 @@ xfs_swap_extent_forks(
switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /*
- * If the extents fit in the inode, fix the pointer. Otherwise
- * it's already NULL or pointing to the extent.
- */
- nextents = xfs_iext_count(&tip->i_df);
- if (nextents <= XFS_INLINE_EXTS)
- tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext;
(*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -2110,11 +1986,31 @@ xfs_swap_extents(
ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+ }
+
+ /* Swap the cow forks. */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ xfs_extnum_t extnum;
+
+ ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+
+ extnum = ip->i_cnextents;
+ ip->i_cnextents = tip->i_cnextents;
+ tip->i_cnextents = extnum;
+
cowfp = ip->i_cowfp;
ip->i_cowfp = tip->i_cowfp;
tip->i_cowfp = cowfp;
- xfs_inode_set_cowblocks_tag(ip);
- xfs_inode_set_cowblocks_tag(tip);
+
+ if (ip->i_cowfp && ip->i_cnextents)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+ if (tip->i_cowfp && tip->i_cnextents)
+ xfs_inode_set_cowblocks_tag(tip);
+ else
+ xfs_inode_clear_cowblocks_tag(tip);
}
xfs_trans_log_inode(tp, ip, src_log_flags);
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0eaa81dc49be..4d4ae48bd4f6 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -28,16 +28,33 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+#else /* !CONFIG_XFS_RT */
+/*
+ * Attempts to allocate RT extents when RT is disable indicates corruption and
+ * should trigger a shutdown.
+ */
+static inline int
+xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
+{
+ return -EFSCORRUPTED;
+}
+#endif /* CONFIG_XFS_RT */
+
int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
int whichfork, int *eof);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
-/* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *);
+struct kgetbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_oflags; /* output flags */
+};
int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
- xfs_bmap_format_t formatter, void *arg);
+ struct kgetbmap *out);
/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index da14658da310..4db6e8d780f6 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -42,6 +42,8 @@
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
static kmem_zone_t *xfs_buf_zone;
@@ -1258,8 +1260,6 @@ xfs_buf_ioapply_map(
int size;
int offset;
- total_nr_pages = bp->b_page_count;
-
/* skip the pages in the buffer before the start offset */
page_index = 0;
offset = *buf_offset;
@@ -2131,3 +2131,17 @@ xfs_buf_terminate(void)
{
kmem_zone_destroy(xfs_buf_zone);
}
+
+void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
+{
+ /*
+ * Set the lru reference count to 0 based on the error injection tag.
+ * This allows userspace to disrupt buffer caching for debug/testing
+ * purposes.
+ */
+ if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
+ XFS_ERRTAG_BUF_LRU_REF))
+ lru_ref = 0;
+
+ atomic_set(&bp->b_lru_ref, lru_ref);
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index bf71507ddb16..f873bb786824 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -352,10 +352,7 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
-static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
-{
- atomic_set(&bp->b_lru_ref, lru_ref);
-}
+void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index ba2638d37031..0c58918bc0ad 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -41,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = {
DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
};
-static unsigned char
+unsigned char
xfs_dir3_get_dtype(
struct xfs_mount *mp,
uint8_t filetype)
@@ -266,7 +266,7 @@ xfs_dir2_leaf_readbuf(
xfs_dablk_t next_ra;
xfs_dablk_t map_off;
xfs_dablk_t last_da;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int ra_want;
int error = 0;
@@ -283,7 +283,7 @@ xfs_dir2_leaf_readbuf(
*/
last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
- if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map))
+ if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map))
goto out;
if (map.br_startoff >= last_da)
goto out;
@@ -311,7 +311,7 @@ xfs_dir2_leaf_readbuf(
if (next_ra >= last_da)
goto out_no_ra;
if (map.br_blockcount < geo->fsbcount &&
- !xfs_iext_get_extent(ifp, ++idx, &map))
+ !xfs_iext_next_extent(ifp, &icur, &map))
goto out_no_ra;
if (map.br_startoff >= last_da)
goto out_no_ra;
@@ -334,7 +334,7 @@ xfs_dir2_leaf_readbuf(
ra_want -= geo->fsbcount;
next_ra += geo->fsbcount;
}
- if (!xfs_iext_get_extent(ifp, ++idx, &map)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &map)) {
*ra_blk = last_da;
break;
}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index 0f070f9e44e1..de92d9cc958f 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef XFS_DISCARD_H
#define XFS_DISCARD_H 1
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index cd82429d8df7..d57c2db64e59 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -53,13 +53,6 @@
* otherwise by the lowest id first, see xfs_dqlock2.
*/
-#ifdef DEBUG
-xfs_buftarg_t *xfs_dqerror_target;
-int xfs_do_dqerror;
-int xfs_dqreq_num;
-int xfs_dqerror_mod = 33;
-#endif
-
struct kmem_zone *xfs_qm_dqtrxzone;
static struct kmem_zone *xfs_qm_dqzone;
@@ -703,7 +696,7 @@ xfs_dq_get_next_id(
xfs_dqid_t next_id = *id + 1; /* simple advance */
uint lock_flags;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor cur;
xfs_fsblock_t start;
int error = 0;
@@ -727,7 +720,7 @@ xfs_dq_get_next_id(
return error;
}
- if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &idx, &got)) {
+ if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &cur, &got)) {
/* contiguous chunk, bump startoff for the id calculation */
if (got.br_startoff < start)
got.br_startoff = start;
@@ -770,15 +763,6 @@ xfs_qm_dqget(
return -ESRCH;
}
-#ifdef DEBUG
- if (xfs_do_dqerror) {
- if ((xfs_dqerror_target == mp->m_ddev_targp) &&
- (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
- xfs_debug(mp, "Returning error in dqget");
- return -EIO;
- }
- }
-
ASSERT(type == XFS_DQ_USER ||
type == XFS_DQ_PROJ ||
type == XFS_DQ_GROUP);
@@ -786,7 +770,6 @@ xfs_qm_dqget(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(xfs_inode_dquot(ip, type) == NULL);
}
-#endif
restart:
mutex_lock(&qi->qi_tree_lock);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index bd786a9ac2c3..4c9f35d983b2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -21,6 +21,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_sysfs.h"
@@ -58,6 +59,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_DROP_WRITES,
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
+ XFS_RANDOM_BUF_LRU_REF,
};
struct xfs_errortag_attr {
@@ -163,6 +165,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
+XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -196,10 +199,11 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(drop_writes),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
+ XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
NULL,
};
-struct kobj_type xfs_errortag_ktype = {
+static struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_errortag_sysfs_ops,
.default_attrs = xfs_errortag_attrs,
@@ -347,7 +351,7 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
__return_address, bp->b_ops->name, bp->b_bn);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 7c4bef3bddb7..ea816c1bf8db 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -63,87 +63,6 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
} \
}
-/*
- * error injection tags - the labels can be anything you want
- * but each tag should have its own unique number
- */
-
-#define XFS_ERRTAG_NOERROR 0
-#define XFS_ERRTAG_IFLUSH_1 1
-#define XFS_ERRTAG_IFLUSH_2 2
-#define XFS_ERRTAG_IFLUSH_3 3
-#define XFS_ERRTAG_IFLUSH_4 4
-#define XFS_ERRTAG_IFLUSH_5 5
-#define XFS_ERRTAG_IFLUSH_6 6
-#define XFS_ERRTAG_DA_READ_BUF 7
-#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
-#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
-#define XFS_ERRTAG_ALLOC_READ_AGF 10
-#define XFS_ERRTAG_IALLOC_READ_AGI 11
-#define XFS_ERRTAG_ITOBP_INOTOBP 12
-#define XFS_ERRTAG_IUNLINK 13
-#define XFS_ERRTAG_IUNLINK_REMOVE 14
-#define XFS_ERRTAG_DIR_INO_VALIDATE 15
-#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
-#define XFS_ERRTAG_IODONE_IOERR 17
-#define XFS_ERRTAG_STRATREAD_IOERR 18
-#define XFS_ERRTAG_STRATCMPL_IOERR 19
-#define XFS_ERRTAG_DIOWRITE_IOERR 20
-#define XFS_ERRTAG_BMAPIFORMAT 21
-#define XFS_ERRTAG_FREE_EXTENT 22
-#define XFS_ERRTAG_RMAP_FINISH_ONE 23
-#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
-#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
-#define XFS_ERRTAG_BMAP_FINISH_ONE 26
-#define XFS_ERRTAG_AG_RESV_CRITICAL 27
-/*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
- */
-#define XFS_ERRTAG_DROP_WRITES 28
-#define XFS_ERRTAG_LOG_BAD_CRC 29
-#define XFS_ERRTAG_LOG_ITEM_PIN 30
-#define XFS_ERRTAG_MAX 31
-
-/*
- * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
- */
-#define XFS_RANDOM_DEFAULT 100
-#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
-#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_FREE_EXTENT 1
-#define XFS_RANDOM_RMAP_FINISH_ONE 1
-#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
-#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
-#define XFS_RANDOM_BMAP_FINISH_ONE 1
-#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_DROP_WRITES 1
-#define XFS_RANDOM_LOG_BAD_CRC 1
-#define XFS_RANDOM_LOG_ITEM_PIN 1
-
#ifdef DEBUG
extern int xfs_errortag_init(struct xfs_mount *mp);
extern void xfs_errortag_del(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ebdd0bd2b261..18146873a8b3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -58,7 +58,7 @@ xfs_zero_range(
xfs_off_t count,
bool *did_zero)
{
- return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
+ return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
}
int
@@ -237,11 +237,13 @@ xfs_file_dax_read(
if (!count)
return 0; /* skip atime */
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
return -EAGAIN;
+ } else {
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
+
ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -259,9 +261,10 @@ xfs_file_buffered_aio_read(
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
return -EAGAIN;
+ } else {
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
ret = generic_file_read_iter(iocb, to);
@@ -377,8 +380,6 @@ restart:
*/
spin_lock(&ip->i_flags_lock);
if (iocb->ki_pos > i_size_read(inode)) {
- bool zero = false;
-
spin_unlock(&ip->i_flags_lock);
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
@@ -399,7 +400,7 @@ restart:
drained_dio = true;
goto restart;
}
- error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
+ error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL);
if (error)
return error;
} else
@@ -436,7 +437,6 @@ xfs_dio_write_end_io(
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
loff_t offset = iocb->ki_pos;
- bool update_size = false;
int error = 0;
trace_xfs_end_io_direct_write(ip, offset, size);
@@ -447,6 +447,21 @@ xfs_dio_write_end_io(
if (size <= 0)
return size;
+ if (flags & IOMAP_DIO_COW) {
+ error = xfs_reflink_end_cow(ip, offset, size);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Unwritten conversion updates the in-core isize after extent
+ * conversion but before updating the on-disk size. Updating isize any
+ * earlier allows a racing dio read to find unwritten extents before
+ * they are converted.
+ */
+ if (flags & IOMAP_DIO_UNWRITTEN)
+ return xfs_iomap_write_unwritten(ip, offset, size, true);
+
/*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
@@ -461,20 +476,11 @@ xfs_dio_write_end_io(
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode)) {
i_size_write(inode, offset + size);
- update_size = true;
- }
- spin_unlock(&ip->i_flags_lock);
-
- if (flags & IOMAP_DIO_COW) {
- error = xfs_reflink_end_cow(ip, offset, size);
- if (error)
- return error;
- }
-
- if (flags & IOMAP_DIO_UNWRITTEN)
- error = xfs_iomap_write_unwritten(ip, offset, size);
- else if (update_size)
+ spin_unlock(&ip->i_flags_lock);
error = xfs_setfilesize(ip, offset, size);
+ } else {
+ spin_unlock(&ip->i_flags_lock);
+ }
return error;
}
@@ -549,9 +555,10 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- if (!xfs_ilock_nowait(ip, iolock)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, iolock))
return -EAGAIN;
+ } else {
xfs_ilock(ip, iolock);
}
@@ -603,9 +610,10 @@ xfs_file_dax_write(
size_t count;
loff_t pos;
- if (!xfs_ilock_nowait(ip, iolock)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, iolock))
return -EAGAIN;
+ } else {
xfs_ilock(ip, iolock);
}
@@ -761,7 +769,7 @@ xfs_file_fallocate(
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
loff_t new_size = 0;
- bool do_file_insert = 0;
+ bool do_file_insert = false;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
@@ -822,7 +830,7 @@ xfs_file_fallocate(
error = -EINVAL;
goto out_unlock;
}
- do_file_insert = 1;
+ do_file_insert = true;
} else {
flags |= XFS_PREALLOC_SET;
@@ -976,7 +984,7 @@ xfs_file_readdir(
* point we can change the ->readdir prototype to include the
* buffer size. For now we use the current glibc buffer size.
*/
- bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
return xfs_readdir(NULL, ip, ctx, bufsize);
}
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 814ed729881d..43cfc07996a4 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -367,29 +367,6 @@ xfs_getfsmap_datadev_helper(
return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
}
-/* Transform a rtbitmap "record" into a fsmap */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_helper(
- struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
- void *priv)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_getfsmap_info *info = priv;
- struct xfs_rmap_irec irec;
- xfs_daddr_t rec_daddr;
-
- rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
- irec.rm_startblock = rec->ar_startblock;
- irec.rm_blockcount = rec->ar_blockcount;
- irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
- irec.rm_offset = 0;
- irec.rm_flags = 0;
-
- return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
-}
-
/* Transform a bnobt irec into a fsmap */
STATIC int
xfs_getfsmap_datadev_bnobt_helper(
@@ -475,6 +452,30 @@ xfs_getfsmap_logdev(
return xfs_getfsmap_helper(tp, info, &rmap, 0);
}
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
/* Execute a getfsmap query against the realtime device. */
STATIC int
__xfs_getfsmap_rtdev(
@@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap(
return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
info);
}
+#endif /* CONFIG_XFS_RT */
/* Execute a getfsmap query against the regular data device. */
STATIC int
@@ -795,7 +797,15 @@ xfs_getfsmap_check_keys(
return false;
}
+/*
+ * There are only two devices if we didn't configure RT devices at build time.
+ */
+#ifdef CONFIG_XFS_RT
#define XFS_GETFSMAP_DEVS 3
+#else
+#define XFS_GETFSMAP_DEVS 2
+#endif /* CONFIG_XFS_RT */
+
/*
* Get filesystem's extents as described in head, and format for
* output. Calls formatter to fill the user's buffer until all
@@ -853,10 +863,12 @@ xfs_getfsmap(
handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
handlers[1].fn = xfs_getfsmap_logdev;
}
+#ifdef CONFIG_XFS_RT
if (mp->m_rtdev_targp) {
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
}
+#endif /* CONFIG_XFS_RT */
xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
xfs_getfsmap_dev_compare);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 34227115a5d6..43005fbe8b1e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -610,7 +610,7 @@ again:
} else {
rcu_read_unlock();
if (flags & XFS_IGET_INCORE) {
- error = -ENOENT;
+ error = -ENODATA;
goto out_error_or_again;
}
XFS_STATS_INC(mp, xs_ig_missed);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5599dda4727a..d8226f7a5dde 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -39,6 +39,7 @@
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
@@ -384,14 +385,6 @@ xfs_isilocked(
}
#endif
-#ifdef DEBUG
-int xfs_locked_n;
-int xfs_small_retries;
-int xfs_middle_retries;
-int xfs_lots_retries;
-int xfs_lock_delays;
-#endif
-
/*
* xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
* DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
@@ -544,24 +537,11 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
- xfs_lock_delays++;
-#endif
}
i = 0;
try_lock = 0;
goto again;
}
-
-#ifdef DEBUG
- if (attempts) {
- if (attempts < 5) xfs_small_retries++;
- else if (attempts < 100) xfs_middle_retries++;
- else xfs_lots_retries++;
- } else {
- xfs_locked_n++;
- }
-#endif
}
/*
@@ -767,7 +747,7 @@ xfs_ialloc(
xfs_inode_t *pip,
umode_t mode,
xfs_nlink_t nlink,
- xfs_dev_t rdev,
+ dev_t rdev,
prid_t prid,
int okalloc,
xfs_buf_t **ialloc_context,
@@ -819,6 +799,7 @@ xfs_ialloc(
set_nlink(inode, nlink);
ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+ inode->i_rdev = rdev;
xfs_set_projid(ip, prid);
if (pip && XFS_INHERIT_GID(pip)) {
@@ -867,7 +848,6 @@ xfs_ialloc(
case S_IFBLK:
case S_IFSOCK:
ip->i_d.di_format = XFS_DINODE_FMT_DEV;
- ip->i_df.if_u2.if_rdev = rdev;
ip->i_df.if_flags = 0;
flags |= XFS_ILOG_DEV;
break;
@@ -933,7 +913,7 @@ xfs_ialloc(
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_flags = XFS_IFEXTENTS;
ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
- ip->i_df.if_u1.if_extents = NULL;
+ ip->i_df.if_u1.if_root = NULL;
break;
default:
ASSERT(0);
@@ -975,7 +955,7 @@ xfs_dir_ialloc(
the inode. */
umode_t mode,
xfs_nlink_t nlink,
- xfs_dev_t rdev,
+ dev_t rdev,
prid_t prid, /* project id */
int okalloc, /* ok to allocate new space */
xfs_inode_t **ipp, /* pointer to inode; it will be
@@ -1147,7 +1127,7 @@ xfs_create(
xfs_inode_t *dp,
struct xfs_name *name,
umode_t mode,
- xfs_dev_t rdev,
+ dev_t rdev,
xfs_inode_t **ipp)
{
int is_dir = S_ISDIR(mode);
@@ -1183,7 +1163,6 @@ xfs_create(
return error;
if (is_dir) {
- rdev = 0;
resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
tres = &M_RES(mp)->tr_mkdir;
} else {
@@ -1624,10 +1603,12 @@ xfs_itruncate_extents(
goto out;
/*
- * Clear the reflink flag if we truncated everything.
+ * Clear the reflink flag if there are no data fork blocks and
+ * there are no extents staged in the cow fork.
*/
- if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
+ if (ip->i_d.di_nblocks == 0)
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0ee453de239a..cc13c3763721 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -391,7 +391,7 @@ void xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
- umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+ umode_t mode, dev_t rdev, struct xfs_inode **ipp);
int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
umode_t mode, struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
@@ -428,7 +428,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
- xfs_nlink_t, xfs_dev_t, prid_t, int,
+ xfs_nlink_t, dev_t, prid_t, int,
struct xfs_inode **, int *);
/* from xfs_file.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6d0f74ec31e8..6ee5c3bf19ad 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -72,7 +72,6 @@ xfs_inode_item_data_fork_size(
break;
case XFS_DINODE_FMT_DEV:
- case XFS_DINODE_FMT_UUID:
break;
default:
ASSERT(0);
@@ -156,15 +155,13 @@ xfs_inode_item_format_data_fork(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_d.di_nextents > 0 &&
ip->i_df.if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(ip->i_df.if_u1.if_extents != NULL);
ASSERT(xfs_iext_count(&ip->i_df) > 0);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
@@ -181,8 +178,7 @@ xfs_inode_item_format_data_fork(
break;
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
@@ -200,8 +196,7 @@ xfs_inode_item_format_data_fork(
break;
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
- ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
/*
@@ -224,17 +219,9 @@ xfs_inode_item_format_data_fork(
break;
case XFS_DINODE_FMT_DEV:
iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
if (iip->ili_fields & XFS_ILOG_DEV)
- ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
- break;
- case XFS_DINODE_FMT_UUID:
- iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_DEV);
- if (iip->ili_fields & XFS_ILOG_UUID)
- ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
+ ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
break;
default:
ASSERT(0);
@@ -264,7 +251,6 @@ xfs_inode_item_format_attr_fork(
ASSERT(xfs_iext_count(ip->i_afp) ==
ip->i_d.di_anextents);
- ASSERT(ip->i_afp->if_u1.if_extents != NULL);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -364,6 +350,9 @@ xfs_inode_to_log_dinode(
to->di_dmstate = from->di_dmstate;
to->di_flags = from->di_flags;
+ /* log a dummy value to ensure log structure is fully initialised */
+ to->di_next_unlinked = NULLAGINO;
+
if (from->di_version == 3) {
to->di_changecount = inode->i_version;
to->di_crtime.t_sec = from->di_crtime.t_sec;
@@ -404,6 +393,11 @@ xfs_inode_item_format_core(
* the second with the on-disk inode structure, and a possible third and/or
* fourth with the inode data/extents/b-tree root and inode attributes
* data/extents/b-tree root.
+ *
+ * Note: Always use the 64 bit inode log format structure so we don't
+ * leave an uninitialised hole in the format item on 64 bit systems. Log
+ * recovery on 32 bit systems handles this just fine, so there's no reason
+ * for not using an initialising the properly padded structure all the time.
*/
STATIC void
xfs_inode_item_format(
@@ -412,8 +406,8 @@ xfs_inode_item_format(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- struct xfs_inode_log_format *ilf;
struct xfs_log_iovec *vecp = NULL;
+ struct xfs_inode_log_format *ilf;
ASSERT(ip->i_d.di_version > 1);
@@ -425,7 +419,17 @@ xfs_inode_item_format(
ilf->ilf_boffset = ip->i_imap.im_boffset;
ilf->ilf_fields = XFS_ILOG_CORE;
ilf->ilf_size = 2; /* format + core */
- xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+ /*
+ * make sure we don't leak uninitialised data into the log in the case
+ * when we don't log every field in the inode.
+ */
+ ilf->ilf_dsize = 0;
+ ilf->ilf_asize = 0;
+ ilf->ilf_pad = 0;
+ memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u));
+
+ xlog_finish_iovec(lv, vecp, sizeof(*ilf));
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
@@ -745,7 +749,7 @@ xfs_iflush_done(
*/
iip = INODE_ITEM(blip);
if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
- lip->li_flags & XFS_LI_FAILED)
+ (blip->li_flags & XFS_LI_FAILED))
need_ail++;
blip = next;
@@ -855,44 +859,28 @@ xfs_istale_done(
}
/*
- * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
- * (which can have different field alignments) to the native version
+ * convert an xfs_inode_log_format struct from the old 32 bit version
+ * (which can have different field alignments) to the native 64 bit version
*/
int
xfs_inode_item_format_convert(
- xfs_log_iovec_t *buf,
- xfs_inode_log_format_t *in_f)
+ struct xfs_log_iovec *buf,
+ struct xfs_inode_log_format *in_f)
{
- if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
- xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
-
- in_f->ilf_type = in_f32->ilf_type;
- in_f->ilf_size = in_f32->ilf_size;
- in_f->ilf_fields = in_f32->ilf_fields;
- in_f->ilf_asize = in_f32->ilf_asize;
- in_f->ilf_dsize = in_f32->ilf_dsize;
- in_f->ilf_ino = in_f32->ilf_ino;
- /* copy biggest field of ilf_u */
- uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
- in_f->ilf_blkno = in_f32->ilf_blkno;
- in_f->ilf_len = in_f32->ilf_len;
- in_f->ilf_boffset = in_f32->ilf_boffset;
- return 0;
- } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
- xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
-
- in_f->ilf_type = in_f64->ilf_type;
- in_f->ilf_size = in_f64->ilf_size;
- in_f->ilf_fields = in_f64->ilf_fields;
- in_f->ilf_asize = in_f64->ilf_asize;
- in_f->ilf_dsize = in_f64->ilf_dsize;
- in_f->ilf_ino = in_f64->ilf_ino;
- /* copy biggest field of ilf_u */
- uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid);
- in_f->ilf_blkno = in_f64->ilf_blkno;
- in_f->ilf_len = in_f64->ilf_len;
- in_f->ilf_boffset = in_f64->ilf_boffset;
- return 0;
- }
- return -EFSCORRUPTED;
+ struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+
+ if (buf->i_len != sizeof(*in_f32))
+ return -EFSCORRUPTED;
+
+ in_f->ilf_type = in_f32->ilf_type;
+ in_f->ilf_size = in_f32->ilf_size;
+ in_f->ilf_fields = in_f32->ilf_fields;
+ in_f->ilf_asize = in_f32->ilf_asize;
+ in_f->ilf_dsize = in_f32->ilf_dsize;
+ in_f->ilf_ino = in_f32->ilf_ino;
+ memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u));
+ in_f->ilf_blkno = in_f32->ilf_blkno;
+ in_f->ilf_len = in_f32->ilf_len;
+ in_f->ilf_boffset = in_f32->ilf_boffset;
+ return 0;
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 4c7722e325b3..b72373a33cd9 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -48,7 +48,7 @@ extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_iflush_abort(struct xfs_inode *, bool);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
- xfs_inode_log_format_t *);
+ struct xfs_inode_log_format *);
extern struct kmem_zone *xfs_ili_zone;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5049e8ab6e30..20dc65fef6a4 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -44,6 +44,7 @@
#include "xfs_btree.h"
#include <linux/fsmap.h>
#include "xfs_fsmap.h"
+#include "scrub/xfs_scrub.h"
#include <linux/capability.h>
#include <linux/cred.h>
@@ -310,8 +311,8 @@ xfs_readlink_by_handle(
int
xfs_set_dmattrs(
xfs_inode_t *ip,
- u_int evmask,
- u_int16_t state)
+ uint evmask,
+ uint16_t state)
{
xfs_mount_t *mp = ip->i_mount;
xfs_trans_t *tp;
@@ -1088,6 +1089,7 @@ xfs_ioctl_setattr_dax_invalidate(
int *join_flags)
{
struct inode *inode = VFS_I(ip);
+ struct super_block *sb = inode->i_sb;
int error;
*join_flags = 0;
@@ -1100,7 +1102,7 @@ xfs_ioctl_setattr_dax_invalidate(
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+ if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
return -EINVAL;
}
@@ -1200,6 +1202,8 @@ out_unlock:
* 8. for non-realtime files, the extent size hint must be limited
* to half the AG size to avoid alignment extending the extent beyond the
* limits of the AG.
+ *
+ * Please keep this function in sync with xfs_scrub_inode_extsize.
*/
static int
xfs_ioctl_setattr_check_extsize(
@@ -1256,6 +1260,8 @@ xfs_ioctl_setattr_check_extsize(
* 5. Extent size must be a multiple of the appropriate block size.
* 6. The extent size hint must be limited to half the AG size to avoid
* alignment extending the extent beyond the limits of the AG.
+ *
+ * Please keep this function in sync with xfs_scrub_inode_cowextsize.
*/
static int
xfs_ioctl_setattr_check_cowextsize(
@@ -1539,17 +1545,26 @@ out_drop_write:
return error;
}
-STATIC int
-xfs_getbmap_format(void **ap, struct getbmapx *bmv)
+static bool
+xfs_getbmap_format(
+ struct kgetbmap *p,
+ struct getbmapx __user *u,
+ size_t recsize)
{
- struct getbmap __user *base = (struct getbmap __user *)*ap;
-
- /* copy only getbmap portion (not getbmapx) */
- if (copy_to_user(base, bmv, sizeof(struct getbmap)))
- return -EFAULT;
-
- *ap += sizeof(struct getbmap);
- return 0;
+ if (put_user(p->bmv_offset, &u->bmv_offset) ||
+ put_user(p->bmv_block, &u->bmv_block) ||
+ put_user(p->bmv_length, &u->bmv_length) ||
+ put_user(0, &u->bmv_count) ||
+ put_user(0, &u->bmv_entries))
+ return false;
+ if (recsize < sizeof(struct getbmapx))
+ return true;
+ if (put_user(0, &u->bmv_iflags) ||
+ put_user(p->bmv_oflags, &u->bmv_oflags) ||
+ put_user(0, &u->bmv_unused1) ||
+ put_user(0, &u->bmv_unused2))
+ return false;
+ return true;
}
STATIC int
@@ -1559,68 +1574,57 @@ xfs_ioc_getbmap(
void __user *arg)
{
struct getbmapx bmx = { 0 };
- int error;
-
- /* struct getbmap is a strict subset of struct getbmapx. */
- if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
- return -EFAULT;
+ struct kgetbmap *buf;
+ size_t recsize;
+ int error, i;
- if (bmx.bmv_count < 2)
+ switch (cmd) {
+ case XFS_IOC_GETBMAPA:
+ bmx.bmv_iflags = BMV_IF_ATTRFORK;
+ /*FALLTHRU*/
+ case XFS_IOC_GETBMAP:
+ if (file->f_mode & FMODE_NOCMTIME)
+ bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
+ /* struct getbmap is a strict subset of struct getbmapx. */
+ recsize = sizeof(struct getbmap);
+ break;
+ case XFS_IOC_GETBMAPX:
+ recsize = sizeof(struct getbmapx);
+ break;
+ default:
return -EINVAL;
+ }
- bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (file->f_mode & FMODE_NOCMTIME)
- bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
-
- error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
- (__force struct getbmap *)arg+1);
- if (error)
- return error;
-
- /* copy back header - only size of getbmap */
- if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
- return -EFAULT;
- return 0;
-}
-
-STATIC int
-xfs_getbmapx_format(void **ap, struct getbmapx *bmv)
-{
- struct getbmapx __user *base = (struct getbmapx __user *)*ap;
-
- if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
- return -EFAULT;
-
- *ap += sizeof(struct getbmapx);
- return 0;
-}
-
-STATIC int
-xfs_ioc_getbmapx(
- struct xfs_inode *ip,
- void __user *arg)
-{
- struct getbmapx bmx;
- int error;
-
- if (copy_from_user(&bmx, arg, sizeof(bmx)))
+ if (copy_from_user(&bmx, arg, recsize))
return -EFAULT;
if (bmx.bmv_count < 2)
return -EINVAL;
+ if (bmx.bmv_count > ULONG_MAX / recsize)
+ return -ENOMEM;
- if (bmx.bmv_iflags & (~BMV_IF_VALID))
- return -EINVAL;
+ buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0);
+ if (!buf)
+ return -ENOMEM;
- error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
- (__force struct getbmapx *)arg+1);
+ error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, buf);
if (error)
- return error;
+ goto out_free_buf;
- /* copy back header */
- if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
- return -EFAULT;
+ error = -EFAULT;
+ if (copy_to_user(arg, &bmx, recsize))
+ goto out_free_buf;
+ arg += recsize;
+
+ for (i = 0; i < bmx.bmv_entries; i++) {
+ if (!xfs_getbmap_format(buf + i, arg, recsize))
+ goto out_free_buf;
+ arg += recsize;
+ }
+ error = 0;
+out_free_buf:
+ kmem_free(buf);
return 0;
}
@@ -1702,6 +1706,30 @@ xfs_ioc_getfsmap(
return 0;
}
+STATIC int
+xfs_ioc_scrub_metadata(
+ struct xfs_inode *ip,
+ void __user *arg)
+{
+ struct xfs_scrub_metadata scrub;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&scrub, arg, sizeof(scrub)))
+ return -EFAULT;
+
+ error = xfs_scrub_metadata(ip, &scrub);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &scrub, sizeof(scrub)))
+ return -EFAULT;
+
+ return 0;
+}
+
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1877,14 +1905,15 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(filp, cmd, arg);
-
case XFS_IOC_GETBMAPX:
- return xfs_ioc_getbmapx(ip, arg);
+ return xfs_ioc_getbmap(filp, cmd, arg);
case FS_IOC_GETFSMAP:
return xfs_ioc_getfsmap(ip, arg);
+ case XFS_IOC_SCRUB_METADATA:
+ return xfs_ioc_scrub_metadata(ip, arg);
+
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
case XFS_IOC_PATH_TO_FSHANDLE: {
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index e86c3ea137d2..8de879f0c7d5 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -86,7 +86,7 @@ xfs_file_compat_ioctl(
extern int
xfs_set_dmattrs(
struct xfs_inode *ip,
- u_int evmask,
- u_int16_t state);
+ uint evmask,
+ uint16_t state);
#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index fa0bc4d46065..35c79e246fde 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -556,6 +556,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_ERROR_INJECTION:
case XFS_IOC_ERROR_CLEARALL:
case FS_IOC_GETFSMAP:
+ case XFS_IOC_SCRUB_METADATA:
return xfs_file_ioctl(filp, cmd, p);
#ifndef BROKEN_X86_ALIGNMENT
/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a1909bc064e9..18077e2189a9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -30,6 +30,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
@@ -54,13 +55,13 @@ xfs_bmbt_to_iomap(
struct xfs_mount *mp = ip->i_mount;
if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
- iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
if (imap->br_state == XFS_EXT_UNWRITTEN)
iomap->type = IOMAP_UNWRITTEN;
else
@@ -389,7 +390,7 @@ xfs_iomap_prealloc_size(
struct xfs_inode *ip,
loff_t offset,
loff_t count,
- xfs_extnum_t idx)
+ struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
@@ -414,7 +415,7 @@ xfs_iomap_prealloc_size(
*/
if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
- !xfs_iext_get_extent(ifp, idx - 1, &prev) ||
+ !xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
prev.br_startoff + prev.br_blockcount < offset_fsb)
return mp->m_writeio_blocks;
@@ -532,7 +533,7 @@ xfs_file_iomap_begin_delay(
xfs_fileoff_t end_fsb;
int error = 0, eof = 0;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
xfs_fsblock_t prealloc_blocks = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
@@ -557,7 +558,7 @@ xfs_file_iomap_begin_delay(
goto out_unlock;
}
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
+ eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
if (!eof && got.br_startoff <= offset_fsb) {
if (xfs_is_reflink_inode(ip)) {
bool shared;
@@ -591,7 +592,8 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
+ &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -613,7 +615,8 @@ xfs_file_iomap_begin_delay(
retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof);
+ end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
+ eof);
switch (error) {
case 0:
break;
@@ -829,7 +832,8 @@ int
xfs_iomap_write_unwritten(
xfs_inode_t *ip,
xfs_off_t offset,
- xfs_off_t count)
+ xfs_off_t count,
+ bool update_isize)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -840,6 +844,7 @@ xfs_iomap_write_unwritten(
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
struct xfs_defer_ops dfops;
+ struct inode *inode = VFS_I(ip);
xfs_fsize_t i_size;
uint resblks;
int error;
@@ -899,7 +904,8 @@ xfs_iomap_write_unwritten(
i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
if (i_size > offset + count)
i_size = offset + count;
-
+ if (update_isize && i_size > i_size_read(inode))
+ i_size_write(inode, i_size);
i_size = xfs_new_eof(ip, i_size);
if (i_size) {
ip->i_d.di_size = i_size;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 00db3ecea084..ee535065c5d0 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 17081c77ef86..56475fcd76f2 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -160,7 +160,6 @@ xfs_generic_create(
if (S_ISCHR(mode) || S_ISBLK(mode)) {
if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
return -EINVAL;
- rdev = sysv_encode_dev(rdev);
} else {
rdev = 0;
}
@@ -535,8 +534,7 @@ xfs_vn_getattr(
case S_IFBLK:
case S_IFCHR:
stat->blksize = BLKDEV_IOSIZE;
- stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
- sysv_minor(ip->i_df.if_u2.if_rdev));
+ stat->rdev = inode->i_rdev;
break;
default:
if (XFS_IS_REALTIME_INODE(ip)) {
@@ -886,22 +884,6 @@ xfs_setattr_size(
return error;
/*
- * We are going to log the inode size change in this transaction so
- * any previous writes that are beyond the on disk EOF and the new
- * EOF that have not been written out need to be written here. If we
- * do not write the data out, we expose ourselves to the null files
- * problem. Note that this includes any block zeroing we did above;
- * otherwise those blocks may not be zeroed after a crash.
- */
- if (did_zeroing ||
- (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- ip->i_d.di_size, newsize);
- if (error)
- return error;
- }
-
- /*
* We've already locked out new page faults, so now we can safely remove
* pages from the page cache knowing they won't get refaulted until we
* drop the XFS_MMAP_EXCL lock after the extent manipulations are
@@ -917,9 +899,29 @@ xfs_setattr_size(
* user visible changes). There's not much we can do about this, except
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
+ *
+ * And we update in-core i_size and truncate page cache beyond newsize
+ * before writeback the [di_size, newsize] range, so we're guaranteed
+ * not to write stale data past the new EOF on truncate down.
*/
truncate_setsize(inode, newsize);
+ /*
+ * We are going to log the inode size change in this transaction so
+ * any previous writes that are beyond the on disk EOF and the new
+ * EOF that have not been written out need to be written here. If we
+ * do not write the data out, we expose ourselves to the null files
+ * problem. Note that this includes any block zeroing we did above;
+ * otherwise those blocks may not be zeroed after a crash.
+ */
+ if (did_zeroing ||
+ (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ip->i_d.di_size, newsize - 1);
+ if (error)
+ return error;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
return error;
@@ -1231,18 +1233,6 @@ xfs_setup_inode(
inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
- switch (inode->i_mode & S_IFMT) {
- case S_IFBLK:
- case S_IFCHR:
- inode->i_rdev =
- MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
- sysv_minor(ip->i_df.if_u2.if_rdev));
- break;
- default:
- inode->i_rdev = 0;
- break;
- }
-
i_size_write(inode, ip->i_d.di_size);
xfs_diflags_to_iflags(inode, ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c393a2f6d8c3..d58310514423 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -31,16 +31,6 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
-int
-xfs_internal_inum(
- xfs_mount_t *mp,
- xfs_ino_t ino)
-{
- return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
- (xfs_sb_version_hasquota(&mp->m_sb) &&
- xfs_is_quota_inode(&mp->m_sb, ino)));
-}
-
/*
* Return stat information for one inode.
* Return 0 if ok, else errno.
@@ -119,12 +109,11 @@ xfs_bulkstat_one_int(
switch (dic->di_format) {
case XFS_DINODE_FMT_DEV:
- buf->bs_rdev = ip->i_df.if_u2.if_rdev;
+ buf->bs_rdev = sysv_encode_dev(inode->i_rdev);
buf->bs_blksize = BLKDEV_IOSIZE;
buf->bs_blocks = 0;
break;
case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_UUID:
buf->bs_rdev = 0;
buf->bs_blksize = mp->m_sb.sb_blocksize;
buf->bs_blocks = 0;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 17e86e0541af..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -96,6 +96,4 @@ xfs_inumbers(
void __user *buffer, /* buffer with inode info */
inumbers_fmt_pf formatter);
-int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
-
#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index dcd1292664b3..6282bfc1afa9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -142,6 +142,13 @@ typedef __u32 xfs_nlink_t;
#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
+/*
+ * Return the address of a label. Use barrier() so that the optimizer
+ * won't reorder code to refactor the error jumpouts into a single
+ * return, which throws off the reported address.
+ */
+#define __this_address ({ __label__ __here; __here: barrier(); &&__here; })
+
#define XFS_PROJID_DEFAULT 0
#define MIN(a,b) (min(a,b))
@@ -243,10 +250,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#ifndef STATIC
-# define STATIC noinline
-#endif
-
#else /* !DEBUG */
#ifdef XFS_WARN
@@ -254,21 +257,15 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
#define ASSERT(expr) \
(likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
-#ifndef STATIC
-# define STATIC static noinline
-#endif
-
#else /* !DEBUG && !XFS_WARN */
#define ASSERT(expr) ((void)0)
-#ifndef STATIC
-# define STATIC static noinline
-#endif
-
#endif /* XFS_WARN */
#endif /* DEBUG */
+#define STATIC static noinline
+
#ifdef CONFIG_XFS_RT
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c5107c7bc4bf..38d4227895ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -22,6 +22,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -608,6 +609,7 @@ xfs_log_mount(
xfs_daddr_t blk_offset,
int num_bblks)
{
+ bool fatal = xfs_sb_version_hascrc(&mp->m_sb);
int error = 0;
int min_logfsbs;
@@ -659,9 +661,20 @@ xfs_log_mount(
XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
XFS_MAX_LOG_BYTES);
error = -EINVAL;
+ } else if (mp->m_sb.sb_logsunit > 1 &&
+ mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) {
+ xfs_warn(mp,
+ "log stripe unit %u bytes must be a multiple of block size",
+ mp->m_sb.sb_logsunit);
+ error = -EINVAL;
+ fatal = true;
}
if (error) {
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ /*
+ * Log check errors are always fatal on v5; or whenever bad
+ * metadata leads to a crash.
+ */
+ if (fatal) {
xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
ASSERT(0);
goto out_free_log;
@@ -744,6 +757,7 @@ xfs_log_mount_finish(
{
int error = 0;
bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+ bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED;
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -780,6 +794,21 @@ xfs_log_mount_finish(
mp->m_super->s_flags &= ~MS_ACTIVE;
evict_inodes(mp->m_super);
+ /*
+ * Drain the buffer LRU after log recovery. This is required for v4
+ * filesystems to avoid leaving around buffers with NULL verifier ops,
+ * but we do it unconditionally to make sure we're always in a clean
+ * cache state after mount.
+ *
+ * Don't push in the error case because the AIL may have pending intents
+ * that aren't removed until recovery is cancelled.
+ */
+ if (!error && recovered) {
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_ail_push_all_sync(mp->m_ail);
+ }
+ xfs_wait_buftarg(mp->m_ddev_targp);
+
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -2515,7 +2544,7 @@ next_lv:
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0 && ordered == false) {
+ if (record_cnt == 0 && !ordered) {
if (!lv)
return 0;
break;
@@ -3734,7 +3763,7 @@ xlog_ticket_alloc(
* one of the iclogs. This uses backup pointers stored in a different
* part of the log in case we trash the log structure.
*/
-void
+STATIC void
xlog_verify_dest_ptr(
struct xlog *log,
void *ptr)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 51bf7b827387..129975970d99 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -592,9 +592,9 @@ xlog_valid_lsn(
* a transiently forward state. Instead, we can see the LSN in a
* transiently behind state if we happen to race with a cycle wrap.
*/
- cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ cur_cycle = READ_ONCE(log->l_curr_cycle);
smp_rmb();
- cur_block = ACCESS_ONCE(log->l_curr_block);
+ cur_block = READ_ONCE(log->l_curr_block);
if ((CYCLE_LSN(lsn) > cur_cycle) ||
(CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index ee34899396b2..87b1c331f9eb 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -85,17 +85,21 @@ struct xfs_buf_cancel {
*/
/*
- * Verify the given count of basic blocks is valid number of blocks
- * to specify for an operation involving the given XFS log buffer.
- * Returns nonzero if the count is valid, 0 otherwise.
+ * Verify the log-relative block number and length in basic blocks are valid for
+ * an operation involving the given XFS log buffer. Returns true if the fields
+ * are valid, false otherwise.
*/
-
-static inline int
-xlog_buf_bbcount_valid(
+static inline bool
+xlog_verify_bp(
struct xlog *log,
+ xfs_daddr_t blk_no,
int bbcount)
{
- return bbcount > 0 && bbcount <= log->l_logBBsize;
+ if (blk_no < 0 || blk_no >= log->l_logBBsize)
+ return false;
+ if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
+ return false;
+ return true;
}
/*
@@ -110,7 +114,11 @@ xlog_get_bp(
{
struct xfs_buf *bp;
- if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ /*
+ * Pass log block 0 since we don't have an addr yet, buffer will be
+ * verified on read.
+ */
+ if (!xlog_verify_bp(log, 0, nbblks)) {
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
@@ -180,9 +188,10 @@ xlog_bread_noalign(
{
int error;
- if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
- nbblks);
+ if (!xlog_verify_bp(log, blk_no, nbblks)) {
+ xfs_warn(log->l_mp,
+ "Invalid log block/length (0x%llx, 0x%x) for buffer",
+ blk_no, nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return -EFSCORRUPTED;
}
@@ -265,9 +274,10 @@ xlog_bwrite(
{
int error;
- if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
- nbblks);
+ if (!xlog_verify_bp(log, blk_no, nbblks)) {
+ xfs_warn(log->l_mp,
+ "Invalid log block/length (0x%llx, 0x%x) for buffer",
+ blk_no, nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return -EFSCORRUPTED;
}
@@ -753,7 +763,7 @@ xlog_find_head(
* in the in-core log. The following number can be made tighter if
* we actually look at the block size of the filesystem.
*/
- num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+ num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
if (head_blk >= num_scan_bblks) {
/*
* We are guaranteed that the entire check can be performed
@@ -2975,7 +2985,7 @@ xlog_recover_inode_pass2(
struct xlog_recover_item *item,
xfs_lsn_t current_lsn)
{
- xfs_inode_log_format_t *in_f;
+ struct xfs_inode_log_format *in_f;
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
xfs_dinode_t *dip;
@@ -2989,10 +2999,10 @@ xlog_recover_inode_pass2(
uint isize;
int need_free = 0;
- if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -3163,16 +3173,8 @@ xlog_recover_inode_pass2(
}
fields = in_f->ilf_fields;
- switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
- case XFS_ILOG_DEV:
+ if (fields & XFS_ILOG_DEV)
xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
- break;
- case XFS_ILOG_UUID:
- memcpy(XFS_DFORK_DPTR(dip),
- &in_f->ilf_u.ilfu_uuid,
- sizeof(uuid_t));
- break;
- }
if (in_f->ilf_size == 2)
goto out_owner_change;
@@ -4297,7 +4299,7 @@ xlog_recover_add_to_trans(
char *dp,
int len)
{
- xfs_inode_log_format_t *in_f; /* any will do */
+ struct xfs_inode_log_format *in_f; /* any will do */
xlog_recover_item_t *item;
char *ptr;
@@ -4331,7 +4333,7 @@ xlog_recover_add_to_trans(
ptr = kmem_alloc(len, KM_SLEEP);
memcpy(ptr, dp, len);
- in_f = (xfs_inode_log_format_t *)ptr;
+ in_f = (struct xfs_inode_log_format *)ptr;
/* take the tail entry */
item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
@@ -5823,7 +5825,7 @@ xlog_recover_cancel(
* Read all of the agf and agi counters and check that they
* are consistent with the superblock counters.
*/
-void
+STATIC void
xlog_recover_check_summary(
struct xlog *log)
{
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 85401155750e..34447dca97d1 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __XFS_MESSAGE_H
#define __XFS_MESSAGE_H 1
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ea7d4b4e50d0..c879b517cc94 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -704,7 +704,7 @@ xfs_mountfs(
xfs_set_maxicount(mp);
/* enable fail_at_unmount as default */
- mp->m_fail_unmount = 1;
+ mp->m_fail_unmount = true;
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
if (error)
@@ -1022,10 +1022,21 @@ xfs_mountfs(
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
+ /*
+ * Cancel all delayed reclaim work and reclaim the inodes directly.
+ * We have to do this /after/ rtunmount and qm_unmount because those
+ * two will have scheduled delayed reclaim for the rt/quota inodes.
+ *
+ * This is slightly different from the unmountfs call sequence
+ * because we could be tearing down a partially set up mount. In
+ * particular, if log_mount_finish fails we bail out without calling
+ * qm_unmount_quotas and therefore rely on qm_unmount to release the
+ * quota inodes.
+ */
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 0c381d71b242..0492436a053f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
- XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
}
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 2f2dc3c09ad0..aa6c5c193f45 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
@@ -274,7 +275,7 @@ xfs_fs_commit_blocks(
(end - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(error);
- error = xfs_iomap_write_unwritten(ip, start, length);
+ error = xfs_iomap_write_unwritten(ip, start, length, false);
if (error)
goto out_drop_iolock;
}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b587cb99b2b7..bf45951e28fe 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3246815c24d6..cc041a29eb70 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -273,7 +273,7 @@ xfs_reflink_reserve_cow(
struct xfs_bmbt_irec got;
int error = 0;
bool eof = false, trimmed;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -284,7 +284,7 @@ xfs_reflink_reserve_cow(
* tree.
*/
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
eof = true;
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
@@ -312,7 +312,7 @@ xfs_reflink_reserve_cow(
return error;
error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &idx, eof);
+ imap->br_blockcount, 0, &got, &icur, eof);
if (error == -ENOSPC || error == -EDQUOT)
trace_xfs_reflink_cow_enospc(ip, imap);
if (error)
@@ -353,29 +353,22 @@ xfs_reflink_convert_cow(
xfs_off_t offset,
xfs_off_t count)
{
- struct xfs_bmbt_irec got;
- struct xfs_defer_ops dfops;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_extnum_t idx;
- bool found;
- int error = 0;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ struct xfs_bmbt_irec imap;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t first_block = NULLFSBLOCK;
+ int nimaps = 1, error = 0;
- /* Convert all the extents to real from unwritten. */
- for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
- found && got.br_startoff < end_fsb;
- found = xfs_iext_get_extent(ifp, ++idx, &got)) {
- error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
- end_fsb - offset_fsb, &dfops);
- if (error)
- break;
- }
+ ASSERT(count != 0);
- /* Finish up. */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
+ XFS_BMAPI_CONVERT_ONLY, &first_block, 0, &imap, &nimaps,
+ &dfops);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -399,7 +392,7 @@ xfs_reflink_allocate_cow(
bool trimmed;
xfs_filblks_t resaligned;
xfs_extlen_t resblks = 0;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
retry:
ASSERT(xfs_is_reflink_inode(ip));
@@ -409,7 +402,7 @@ retry:
* Even if the extent is not shared we might have a preallocation for
* it in the COW fork. If so use it.
*/
- if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) &&
+ if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
got.br_startoff <= offset_fsb) {
*shared = true;
@@ -496,13 +489,13 @@ xfs_reflink_find_cow_mapping(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
xfs_fileoff_t offset_fsb;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(xfs_is_reflink_inode(ip));
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
- if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
return false;
if (got.br_startoff > offset_fsb)
return false;
@@ -524,18 +517,18 @@ xfs_reflink_trim_irec_to_next_cow(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
if (!xfs_is_reflink_inode(ip))
return;
/* Find the extent in the CoW fork. */
- if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
return;
/* This is the extent before; try sliding up one. */
if (got.br_startoff < offset_fsb) {
- if (!xfs_iext_get_extent(ifp, idx + 1, &got))
+ if (!xfs_iext_next_extent(ifp, &icur, &got))
return;
}
@@ -562,24 +555,32 @@ xfs_reflink_cancel_cow_blocks(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
int error = 0;
if (!xfs_is_reflink_inode(ip))
return 0;
- if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
return 0;
- while (got.br_startoff < end_fsb) {
+ /* Walk backwards until we're out of the I/O range... */
+ while (got.br_startoff + got.br_blockcount > offset_fsb) {
del = got;
xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+
+ /* Extent delete may have bumped ext forward */
+ if (!del.br_blockcount) {
+ xfs_iext_prev(ifp, &icur);
+ goto next_extent;
+ }
+
trace_xfs_reflink_cancel_cow(ip, &del);
if (isnullstartblock(del.br_startblock)) {
error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
- &idx, &got, &del);
+ &icur, &got, &del);
if (error)
break;
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
@@ -610,10 +611,10 @@ xfs_reflink_cancel_cow_blocks(
}
/* Remove the mapping from the CoW fork. */
- xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
}
-
- if (!xfs_iext_get_extent(ifp, ++idx, &got))
+next_extent:
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -698,7 +699,7 @@ xfs_reflink_end_cow(
int error;
unsigned int resblks;
xfs_filblks_t rlen;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
trace_xfs_reflink_end_cow(ip, offset, count);
@@ -733,21 +734,22 @@ xfs_reflink_end_cow(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- /* If there is a hole at end_fsb - 1 go to the previous extent */
- if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
- got.br_startoff > end_fsb) {
- ASSERT(idx > 0);
- xfs_iext_get_extent(ifp, --idx, &got);
- }
+ /*
+ * In case of racing, overlapping AIO writes no COW extents might be
+ * left by the time I/O completes for the loser of the race. In that
+ * case we are done.
+ */
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+ goto out_cancel;
/* Walk backwards until we're out of the I/O range... */
while (got.br_startoff + got.br_blockcount > offset_fsb) {
del = got;
xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
- /* Extent delete may have bumped idx forward */
+ /* Extent delete may have bumped ext forward */
if (!del.br_blockcount) {
- idx--;
+ xfs_iext_prev(ifp, &icur);
goto next_extent;
}
@@ -759,7 +761,7 @@ xfs_reflink_end_cow(
* allocated but have not yet been involved in a write.
*/
if (got.br_state == XFS_EXT_UNWRITTEN) {
- idx--;
+ xfs_iext_prev(ifp, &icur);
goto next_extent;
}
@@ -790,14 +792,14 @@ xfs_reflink_end_cow(
goto out_defer;
/* Remove the mapping from the CoW fork. */
- xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
xfs_defer_ijoin(&dfops, ip);
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_defer;
next_extent:
- if (!xfs_iext_get_extent(ifp, idx, &got))
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -809,6 +811,7 @@ next_extent:
out_defer:
xfs_defer_cancel(&dfops);
+out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
@@ -1426,7 +1429,7 @@ xfs_reflink_inode_has_shared_extents(
xfs_extlen_t aglen;
xfs_agblock_t rbno;
xfs_extlen_t rlen;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
bool found;
int error;
@@ -1438,7 +1441,7 @@ xfs_reflink_inode_has_shared_extents(
}
*has_shared = false;
- found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got);
+ found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
@@ -1457,7 +1460,7 @@ xfs_reflink_inode_has_shared_extents(
return 0;
}
next:
- found = xfs_iext_get_extent(ifp, ++idx, &got);
+ found = xfs_iext_next_extent(ifp, &icur, &got);
}
return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 79defa722bf1..3f30f846d7f2 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -138,6 +138,7 @@ int xfs_rtalloc_query_range(struct xfs_trans *tp,
int xfs_rtalloc_query_all(struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
+bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
@@ -146,6 +147,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp,
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
+# define xfs_verify_rtbno(m, r) (false)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c996f4ae4a5f..f663022353c0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1637,7 +1637,7 @@ xfs_fs_fill_super(
/* version 5 superblocks support inode version counters. */
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
- sb->s_flags |= MS_I_VERSION;
+ sb->s_flags |= SB_I_VERSION;
if (mp->m_flags & XFS_MOUNT_DAX) {
xfs_warn(mp,
@@ -1654,6 +1654,16 @@ xfs_fs_fill_super(
"DAX and reflink have not been tested together!");
}
+ if (mp->m_flags & XFS_MOUNT_DISCARD) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+
+ if (!blk_queue_discard(q)) {
+ xfs_warn(mp, "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ }
+ }
+
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bb5514688d47..515ba042d75c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -218,53 +218,15 @@ TRACE_EVENT(xfs_attr_list_node_descend,
__entry->bt_before)
);
-TRACE_EVENT(xfs_iext_insert,
- TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
- struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
- TP_ARGS(ip, idx, r, state, caller_ip),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_extnum_t, idx)
- __field(xfs_fileoff_t, startoff)
- __field(xfs_fsblock_t, startblock)
- __field(xfs_filblks_t, blockcount)
- __field(xfs_exntst_t, state)
- __field(int, bmap_state)
- __field(unsigned long, caller_ip)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->idx = idx;
- __entry->startoff = r->br_startoff;
- __entry->startblock = r->br_startblock;
- __entry->blockcount = r->br_blockcount;
- __entry->state = r->br_state;
- __entry->bmap_state = state;
- __entry->caller_ip = caller_ip;
- ),
- TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
- "offset %lld block %lld count %lld flag %d caller %ps",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
- (long)__entry->idx,
- __entry->startoff,
- (int64_t)__entry->startblock,
- __entry->blockcount,
- __entry->state,
- (char *)__entry->caller_ip)
-);
-
DECLARE_EVENT_CLASS(xfs_bmap_class,
- TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state,
unsigned long caller_ip),
- TP_ARGS(ip, idx, state, caller_ip),
+ TP_ARGS(ip, cur, state, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(xfs_extnum_t, idx)
+ __field(void *, leaf);
+ __field(int, pos);
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -277,10 +239,11 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
struct xfs_bmbt_irec r;
ifp = xfs_iext_state_to_fork(ip, state);
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
+ xfs_iext_get_extent(ifp, cur, &r);
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->idx = idx;
+ __entry->leaf = cur->leaf;
+ __entry->pos = cur->pos;
__entry->startoff = r.br_startoff;
__entry->startblock = r.br_startblock;
__entry->blockcount = r.br_blockcount;
@@ -288,12 +251,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__entry->bmap_state = state;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+ TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d "
"offset %lld block %lld count %lld flag %d caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
- (long)__entry->idx,
+ __entry->leaf,
+ __entry->pos,
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount,
@@ -303,13 +267,15 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
#define DEFINE_BMAP_EVENT(name) \
DEFINE_EVENT(xfs_bmap_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \
unsigned long caller_ip), \
- TP_ARGS(ip, idx, state, caller_ip))
+ TP_ARGS(ip, cur, state, caller_ip))
+DEFINE_BMAP_EVENT(xfs_iext_insert);
DEFINE_BMAP_EVENT(xfs_iext_remove);
DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
DEFINE_BMAP_EVENT(xfs_bmap_post_update);
-DEFINE_BMAP_EVENT(xfs_extlist);
+DEFINE_BMAP_EVENT(xfs_read_extent);
+DEFINE_BMAP_EVENT(xfs_write_extent);
DECLARE_EVENT_CLASS(xfs_buf_class,
TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 354368a906e5..cef89f7127d3 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -25,6 +25,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_trace.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_log.h"
@@ -514,11 +515,26 @@ xfsaild(
current->flags |= PF_MEMALLOC;
set_freezable();
- while (!kthread_should_stop()) {
+ while (1) {
if (tout && tout <= 20)
- __set_current_state(TASK_KILLABLE);
+ set_current_state(TASK_KILLABLE);
else
- __set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * Check kthread_should_stop() after we set the task state
+ * to guarantee that we either see the stop bit and exit or
+ * the task state is reset to runnable such that it's not
+ * scheduled out indefinitely and detects the stop bit at
+ * next iteration.
+ *
+ * A memory barrier is included in above task state set to
+ * serialize again kthread_stop().
+ */
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
spin_lock(&ailp->xa_lock);