summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c100
-rw-r--r--fs/9p/v9fs.c7
-rw-r--r--fs/9p/vfs_inode.c26
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/Kconfig.binfmt5
-rw-r--r--fs/Makefile2
-rw-r--r--fs/afs/cache.c150
-rw-r--r--fs/afs/cell.c12
-rw-r--r--fs/afs/file.c6
-rw-r--r--fs/afs/inode.c49
-rw-r--r--fs/afs/internal.h8
-rw-r--r--fs/afs/rxrpc.c18
-rw-r--r--fs/afs/server.c6
-rw-r--r--fs/afs/volume.c6
-rw-r--r--fs/aio.c39
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/block_dev.c55
-rw-r--r--fs/btrfs/Kconfig3
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c14
-rw-r--r--fs/btrfs/backref.c14
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/check-integrity.c4
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c263
-rw-r--r--fs/btrfs/ctree.h93
-rw-r--r--fs/btrfs/delayed-inode.c62
-rw-r--r--fs/btrfs/delayed-inode.h8
-rw-r--r--fs/btrfs/delayed-ref.c8
-rw-r--r--fs/btrfs/delayed-ref.h2
-rw-r--r--fs/btrfs/dev-replace.c130
-rw-r--r--fs/btrfs/dev-replace.h9
-rw-r--r--fs/btrfs/dir-item.c1
-rw-r--r--fs/btrfs/disk-io.c226
-rw-r--r--fs/btrfs/disk-io.h15
-rw-r--r--fs/btrfs/extent-tree.c331
-rw-r--r--fs/btrfs/extent_io.c82
-rw-r--r--fs/btrfs/extent_io.h19
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/extent_map.h2
-rw-r--r--fs/btrfs/file.c30
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/hash.c54
-rw-r--r--fs/btrfs/hash.h43
-rw-r--r--fs/btrfs/inode-item.c45
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c346
-rw-r--r--fs/btrfs/ioctl.c129
-rw-r--r--fs/btrfs/locking.c2
-rw-r--r--fs/btrfs/lzo.c2
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/print-tree.c10
-rw-r--r--fs/btrfs/props.c8
-rw-r--r--fs/btrfs/qgroup.c406
-rw-r--r--fs/btrfs/qgroup.h106
-rw-r--r--fs/btrfs/raid56.c29
-rw-r--r--fs/btrfs/reada.c10
-rw-r--r--fs/btrfs/ref-verify.c7
-rw-r--r--fs/btrfs/relocation.c48
-rw-r--r--fs/btrfs/scrub.c126
-rw-r--r--fs/btrfs/send.c38
-rw-r--r--fs/btrfs/super.c261
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c2
-rw-r--r--fs/btrfs/transaction.c231
-rw-r--r--fs/btrfs/transaction.h25
-rw-r--r--fs/btrfs/tree-checker.c151
-rw-r--r--fs/btrfs/tree-checker.h7
-rw-r--r--fs/btrfs/tree-defrag.c5
-rw-r--r--fs/btrfs/tree-log.c202
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/uuid-tree.c2
-rw-r--r--fs/btrfs/volumes.c173
-rw-r--r--fs/btrfs/volumes.h31
-rw-r--r--fs/btrfs/xattr.c12
-rw-r--r--fs/btrfs/xattr.h7
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/cachefiles/interface.c61
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/main.c1
-rw-r--r--fs/cachefiles/namei.c75
-rw-r--r--fs/cachefiles/rdwr.c1
-rw-r--r--fs/cachefiles/xattr.c8
-rw-r--r--fs/ceph/cache.c113
-rw-r--r--fs/ceph/caps.c26
-rw-r--r--fs/ceph/dir.c29
-rw-r--r--fs/ceph/file.c9
-rw-r--r--fs/ceph/super.c27
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/char_dev.c10
-rw-r--r--fs/cifs/Kconfig8
-rw-r--r--fs/cifs/cache.c168
-rw-r--r--fs/cifs/cifsencrypt.c85
-rw-r--r--fs/cifs/cifsfs.c1
-rw-r--r--fs/cifs/cifsglob.h9
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c6
-rw-r--r--fs/cifs/connect.c18
-rw-r--r--fs/cifs/fscache.c130
-rw-r--r--fs/cifs/fscache.h13
-rw-r--r--fs/cifs/inode.c38
-rw-r--r--fs/cifs/link.c27
-rw-r--r--fs/cifs/misc.c54
-rw-r--r--fs/cifs/smb1ops.c1
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2misc.c89
-rw-r--r--fs/cifs/smb2ops.c76
-rw-r--r--fs/cifs/smb2pdu.c57
-rw-r--r--fs/cifs/smb2pdu.h3
-rw-r--r--fs/cifs/smb2proto.h5
-rw-r--r--fs/cifs/smb2transport.c97
-rw-r--r--fs/cifs/smbdirect.c23
-rw-r--r--fs/cifs/smbencrypt.c27
-rw-r--r--fs/cifs/transport.c20
-rw-r--r--fs/d_path.c470
-rw-r--r--fs/dcache.c971
-rw-r--r--fs/dcookies.c11
-rw-r--r--fs/debugfs/inode.c5
-rw-r--r--fs/devpts/inode.c66
-rw-r--r--fs/direct-io.c21
-rw-r--r--fs/dlm/lowcomms.c7
-rw-r--r--fs/efivarfs/file.c6
-rw-r--r--fs/eventfd.c9
-rw-r--r--fs/eventpoll.c23
-rw-r--r--fs/exec.c6
-rw-r--r--fs/ext2/super.c4
-rw-r--r--fs/ext4/balloc.c19
-rw-r--r--fs/ext4/dir.c8
-rw-r--r--fs/ext4/ext4.h17
-rw-r--r--fs/ext4/ext4_jbd2.c7
-rw-r--r--fs/ext4/extents.c23
-rw-r--r--fs/ext4/ialloc.c54
-rw-r--r--fs/ext4/inode.c46
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/super.c65
-rw-r--r--fs/ext4/sysfs.c72
-rw-r--r--fs/ext4/xattr.c121
-rw-r--r--fs/ext4/xattr.h11
-rw-r--r--fs/f2fs/checkpoint.c101
-rw-r--r--fs/f2fs/data.c85
-rw-r--r--fs/f2fs/dir.c32
-rw-r--r--fs/f2fs/extent_cache.c5
-rw-r--r--fs/f2fs/f2fs.h188
-rw-r--r--fs/f2fs/file.c94
-rw-r--r--fs/f2fs/gc.c23
-rw-r--r--fs/f2fs/inline.c3
-rw-r--r--fs/f2fs/inode.c11
-rw-r--r--fs/f2fs/namei.c147
-rw-r--r--fs/f2fs/node.c55
-rw-r--r--fs/f2fs/node.h5
-rw-r--r--fs/f2fs/recovery.c25
-rw-r--r--fs/f2fs/segment.c133
-rw-r--r--fs/f2fs/segment.h27
-rw-r--r--fs/f2fs/super.c348
-rw-r--r--fs/f2fs/sysfs.c73
-rw-r--r--fs/fcntl.c12
-rw-r--r--fs/file.c17
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/fscache/cache.c2
-rw-r--r--fs/fscache/cookie.c393
-rw-r--r--fs/fscache/fsdef.c55
-rw-r--r--fs/fscache/internal.h44
-rw-r--r--fs/fscache/main.c1
-rw-r--r--fs/fscache/netfs.c71
-rw-r--r--fs/fscache/object-list.c28
-rw-r--r--fs/fscache/object.c66
-rw-r--r--fs/fscache/operation.c26
-rw-r--r--fs/fscache/page.c84
-rw-r--r--fs/fscache/stats.c1
-rw-r--r--fs/fuse/inode.c3
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c41
-rw-r--r--fs/gfs2/dir.c13
-rw-r--r--fs/gfs2/file.c34
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.c10
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/recovery.c20
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/trace_gfs2.h9
-rw-r--r--fs/gfs2/xattr.c8
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hostfs/hostfs_user.c2
-rw-r--r--fs/hugetlbfs/inode.c23
-rw-r--r--fs/inode.c12
-rw-r--r--fs/internal.h15
-rw-r--r--fs/ioctl.c7
-rw-r--r--fs/jbd2/journal.c30
-rw-r--r--fs/jbd2/recovery.c4
-rw-r--r--fs/jffs2/erase.c37
-rw-r--r--fs/lockd/svc.c4
-rw-r--r--fs/locks.c2
-rw-r--r--fs/minix/Kconfig2
-rw-r--r--fs/namei.c121
-rw-r--r--fs/namespace.c19
-rw-r--r--fs/nfs/callback_proc.c14
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/fscache-index.c159
-rw-r--r--fs/nfs/fscache.c89
-rw-r--r--fs/nfs/fscache.h15
-rw-r--r--fs/nfs/inode.c5
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs4client.c6
-rw-r--r--fs/nfs/nfs4xdr.c1
-rw-r--r--fs/nfs/pagelist.c6
-rw-r--r--fs/nfs/pnfs.c13
-rw-r--r--fs/nfs/pnfs_nfs.c2
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c89
-rw-r--r--fs/nfsd/nfs3proc.c18
-rw-r--r--fs/nfsd/nfs3xdr.c67
-rw-r--r--fs/nfsd/nfs4callback.c4
-rw-r--r--fs/nfsd/nfs4layouts.c16
-rw-r--r--fs/nfsd/nfs4proc.c38
-rw-r--r--fs/nfsd/nfs4state.c348
-rw-r--r--fs/nfsd/nfs4xdr.c22
-rw-r--r--fs/nfsd/nfsfh.c12
-rw-r--r--fs/nfsd/nfsproc.c23
-rw-r--r--fs/nfsd/nfsxdr.c63
-rw-r--r--fs/nfsd/trace.h98
-rw-r--r--fs/nfsd/vfs.c65
-rw-r--r--fs/nfsd/vfs.h11
-rw-r--r--fs/nfsd/xdr.h3
-rw-r--r--fs/nfsd/xdr3.h3
-rw-r--r--fs/nfsd/xdr4.h5
-rw-r--r--fs/notify/fanotify/fanotify.c28
-rw-r--r--fs/notify/fanotify/fanotify.h3
-rw-r--r--fs/notify/fanotify/fanotify_user.c16
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c8
-rw-r--r--fs/notify/inotify/inotify_user.c23
-rw-r--r--fs/notify/notification.c3
-rw-r--r--fs/nsfs.c1
-rw-r--r--fs/ntfs/mft.c4
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c11
-rw-r--r--fs/ocfs2/cluster/tcp.c6
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmast.c2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c29
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h25
-rw-r--r--fs/ocfs2/dlm/dlmlock.c3
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c25
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c41
-rw-r--r--fs/ocfs2/dlmglue.c23
-rw-r--r--fs/ocfs2/file.c16
-rw-r--r--fs/ocfs2/filecheck.c357
-rw-r--r--fs/ocfs2/filecheck.h29
-rw-r--r--fs/ocfs2/inode.c8
-rw-r--r--fs/ocfs2/namei.c6
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h6
-rw-r--r--fs/ocfs2/refcounttree.c10
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/suballoc.c53
-rw-r--r--fs/ocfs2/super.c49
-rw-r--r--fs/ocfs2/uptodate.c3
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/open.c115
-rw-r--r--fs/orangefs/acl.c1
-rw-r--r--fs/orangefs/orangefs-utils.c2
-rw-r--r--fs/overlayfs/Kconfig14
-rw-r--r--fs/overlayfs/export.c216
-rw-r--r--fs/overlayfs/inode.c58
-rw-r--r--fs/overlayfs/namei.c6
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/super.c1
-rw-r--r--fs/pipe.c9
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/quota/compat.c13
-rw-r--r--fs/quota/quota.c10
-rw-r--r--fs/read_write.c45
-rw-r--r--fs/readdir.c11
-rw-r--r--fs/reiserfs/reiserfs.h2
-rw-r--r--fs/select.c29
-rw-r--r--fs/signalfd.c46
-rw-r--r--fs/splice.c12
-rw-r--r--fs/stat.c12
-rw-r--r--fs/sync.c25
-rw-r--r--fs/sysfs/symlink.c1
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/udf/file.c10
-rw-r--r--fs/udf/ialloc.c4
-rw-r--r--fs/udf/inode.c23
-rw-r--r--fs/udf/super.c260
-rw-r--r--fs/udf/udf_sb.h15
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/utimes.c25
-rw-r--r--fs/xfs/kmem.c6
-rw-r--r--fs/xfs/kmem.h8
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c39
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h31
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c139
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c51
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h14
-rw-r--r--fs/xfs/libxfs/xfs_btree.c125
-rw-r--r--fs/xfs/libxfs/xfs_btree.h19
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c59
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c78
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c13
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c16
-rw-r--r--fs/xfs/libxfs/xfs_format.h13
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c124
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h5
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c27
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c12
-rw-r--r--fs/xfs/libxfs/xfs_sb.c1
-rw-r--r--fs/xfs/scrub/agheader.c9
-rw-r--r--fs/xfs/scrub/attr.c2
-rw-r--r--fs/xfs/scrub/bmap.c174
-rw-r--r--fs/xfs/scrub/common.c24
-rw-r--r--fs/xfs/scrub/common.h13
-rw-r--r--fs/xfs/scrub/dir.c2
-rw-r--r--fs/xfs/scrub/ialloc.c5
-rw-r--r--fs/xfs/scrub/inode.c298
-rw-r--r--fs/xfs/scrub/parent.c12
-rw-r--r--fs/xfs/scrub/quota.c2
-rw-r--r--fs/xfs/scrub/rtbitmap.c3
-rw-r--r--fs/xfs/scrub/trace.h31
-rw-r--r--fs/xfs/xfs_aops.c22
-rw-r--r--fs/xfs/xfs_bmap_util.c44
-rw-r--r--fs/xfs/xfs_buf.c2
-rw-r--r--fs/xfs/xfs_buf_item.c10
-rw-r--r--fs/xfs/xfs_dquot.c6
-rw-r--r--fs/xfs/xfs_dquot_item.c11
-rw-r--r--fs/xfs/xfs_error.c29
-rw-r--r--fs/xfs/xfs_error.h3
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_extent_busy.c5
-rw-r--r--fs/xfs/xfs_file.c52
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_icache.c23
-rw-r--r--fs/xfs/xfs_inode.c11
-rw-r--r--fs/xfs/xfs_inode.h4
-rw-r--r--fs/xfs/xfs_inode_item.c29
-rw-r--r--fs/xfs/xfs_iomap.c42
-rw-r--r--fs/xfs/xfs_iops.c17
-rw-r--r--fs/xfs/xfs_log.c376
-rw-r--r--fs/xfs/xfs_log.h15
-rw-r--r--fs/xfs/xfs_log_cil.c2
-rw-r--r--fs/xfs/xfs_log_recover.c100
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h13
-rw-r--r--fs/xfs/xfs_refcount_item.c9
-rw-r--r--fs/xfs/xfs_reflink.c25
-rw-r--r--fs/xfs/xfs_rmap_item.c4
-rw-r--r--fs/xfs/xfs_super.c73
-rw-r--r--fs/xfs/xfs_trace.h9
-rw-r--r--fs/xfs/xfs_trans.c32
-rw-r--r--fs/xfs/xfs_trans_ail.c152
-rw-r--r--fs/xfs/xfs_trans_buf.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c14
-rw-r--r--fs/xfs/xfs_trans_priv.h42
373 files changed, 8957 insertions, 6981 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 64c58eb26159..9eb34701a566 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -55,42 +55,27 @@ int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
}
-static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- struct v9fs_session_info *v9ses;
- uint16_t klen = 0;
-
- v9ses = (struct v9fs_session_info *)cookie_netfs_data;
- p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
- v9ses, buffer, bufmax);
-
- if (v9ses->cachetag)
- klen = strlen(v9ses->cachetag);
-
- if (klen > bufmax)
- return 0;
-
- memcpy(buffer, v9ses->cachetag, klen);
- p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
- return klen;
-}
-
const struct fscache_cookie_def v9fs_cache_session_index_def = {
.name = "9P.session",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = v9fs_cache_session_get_key,
};
void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
{
/* If no cache session tag was specified, we generate a random one. */
- if (!v9ses->cachetag)
- v9fs_random_cachetag(v9ses);
+ if (!v9ses->cachetag) {
+ if (v9fs_random_cachetag(v9ses) < 0) {
+ v9ses->fscache = NULL;
+ return;
+ }
+ }
v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
&v9fs_cache_session_index_def,
- v9ses, true);
+ v9ses->cachetag,
+ strlen(v9ses->cachetag),
+ NULL, 0,
+ v9ses, 0, true);
p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
v9ses, v9ses->fscache);
}
@@ -99,45 +84,15 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
{
p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
v9ses, v9ses->fscache);
- fscache_relinquish_cookie(v9ses->fscache, 0);
+ fscache_relinquish_cookie(v9ses->fscache, NULL, false);
v9ses->fscache = NULL;
}
-
-static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
- memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
- p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
- &v9inode->vfs_inode, v9inode->qid.path);
- return sizeof(v9inode->qid.path);
-}
-
-static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
- *size = i_size_read(&v9inode->vfs_inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
- &v9inode->vfs_inode, *size);
-}
-
-static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
- memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
- p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
- &v9inode->vfs_inode, v9inode->qid.version);
- return sizeof(v9inode->qid.version);
-}
-
static enum
fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
const void *buffer,
- uint16_t buflen)
+ uint16_t buflen,
+ loff_t object_size)
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
@@ -154,9 +109,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
const struct fscache_cookie_def v9fs_cache_inode_index_def = {
.name = "9p.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = v9fs_cache_inode_get_key,
- .get_attr = v9fs_cache_inode_get_attr,
- .get_aux = v9fs_cache_inode_get_aux,
.check_aux = v9fs_cache_inode_check_aux,
};
@@ -175,7 +127,13 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode, true);
+ &v9inode->qid.path,
+ sizeof(v9inode->qid.path),
+ &v9inode->qid.version,
+ sizeof(v9inode->qid.version),
+ v9inode,
+ i_size_read(&v9inode->vfs_inode),
+ true);
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9inode->fscache);
@@ -190,7 +148,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
inode, v9inode->fscache);
- fscache_relinquish_cookie(v9inode->fscache, 0);
+ fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version,
+ false);
v9inode->fscache = NULL;
}
@@ -203,7 +162,7 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
inode, v9inode->fscache);
- fscache_relinquish_cookie(v9inode->fscache, 1);
+ fscache_relinquish_cookie(v9inode->fscache, NULL, true);
v9inode->fscache = NULL;
}
@@ -236,12 +195,18 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
old = v9inode->fscache;
mutex_lock(&v9inode->fscache_lock);
- fscache_relinquish_cookie(v9inode->fscache, 1);
+ fscache_relinquish_cookie(v9inode->fscache, NULL, true);
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode, true);
+ &v9inode->qid.path,
+ sizeof(v9inode->qid.path),
+ &v9inode->qid.version,
+ sizeof(v9inode->qid.version),
+ v9inode,
+ i_size_read(&v9inode->vfs_inode),
+ true);
p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
inode, old, v9inode->fscache);
@@ -367,7 +332,8 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
const struct v9fs_inode *v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
+ ret = fscache_write_page(v9inode->fscache, page,
+ i_size_read(&v9inode->vfs_inode), GFP_KERNEL);
p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
if (ret != 0)
v9fs_uncache_page(inode, page);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8fb89ddc6cc7..e622f0f10502 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -292,6 +292,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FSCACHE
kfree(v9ses->cachetag);
v9ses->cachetag = match_strdup(&args[0]);
+ if (!v9ses->cachetag) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
#endif
break;
case Opt_cache:
@@ -471,6 +475,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
return fid;
err_clnt:
+#ifdef CONFIG_9P_FSCACHE
+ kfree(v9ses->cachetag);
+#endif
p9_client_destroy(v9ses->clnt);
err_names:
kfree(v9ses->uname);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bdabb2765d1b..9ee534159cc6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -579,6 +579,24 @@ static int v9fs_at_to_dotl_flags(int flags)
}
/**
+ * v9fs_dec_count - helper function to drop i_nlink.
+ *
+ * If a directory has nlink <= 2 (including . and ..), do not drop the link
+ * count: such a value indicates that the underlying exported fs doesn't
+ * maintain nlink accurately. e.g.
+ * - overlayfs sets nlink to 1 for merged dir
+ * - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more
+ * than EXT4_LINK_MAX (65000) links.
+ *
+ * @inode: inode whose nlink is being dropped
+ */
+static void v9fs_dec_count(struct inode *inode)
+{
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
+}
+
+/**
* v9fs_remove - helper function to remove files and directories
* @dir: directory inode that is being deleted
* @dentry: dentry that is being deleted
@@ -621,9 +639,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
*/
if (flags & AT_REMOVEDIR) {
clear_nlink(inode);
- drop_nlink(dir);
+ v9fs_dec_count(dir);
} else
- drop_nlink(inode);
+ v9fs_dec_count(inode);
v9fs_invalidate_inode_attr(inode);
v9fs_invalidate_inode_attr(dir);
@@ -1024,12 +1042,12 @@ clunk_newdir:
if (S_ISDIR(new_inode->i_mode))
clear_nlink(new_inode);
else
- drop_nlink(new_inode);
+ v9fs_dec_count(new_inode);
}
if (S_ISDIR(old_inode->i_mode)) {
if (!new_inode)
inc_nlink(new_dir);
- drop_nlink(old_dir);
+ v9fs_dec_count(old_dir);
}
v9fs_invalidate_inode_attr(old_inode);
v9fs_invalidate_inode_attr(old_dir);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index af03c2a901eb..48ce50484e80 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -94,7 +94,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
- sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME;
+ sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
sb->s_flags |= SB_SYNCHRONOUS;
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 58c2bbd385ad..57a27c42b5ac 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,6 +1,6 @@
config BINFMT_ELF
bool "Kernel support for ELF binaries"
- depends on MMU && (BROKEN || !FRV)
+ depends on MMU
select ELFCORE
default y
---help---
@@ -35,7 +35,7 @@ config ARCH_BINFMT_ELF_STATE
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+ depends on (ARM || (SUPERH32 && !MMU) || C6X)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
@@ -90,7 +90,6 @@ config BINFMT_SCRIPT
config BINFMT_FLAT
bool "Kernel support for flat binaries"
depends on !MMU || ARM || M68K
- depends on !FRV || BROKEN
help
Support uClinux FLAT format binaries.
diff --git a/fs/Makefile b/fs/Makefile
index add789ea270a..c9375fd2c8c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
ioctl.o readdir.o select.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
- pnode.o splice.o sync.o utimes.o \
+ pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
ifeq ($(CONFIG_BLOCK),y)
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index f62ff71d28c9..b1c31ec4523a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -12,167 +12,39 @@
#include <linux/sched.h>
#include "internal.h"
-static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
-static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
-
-static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
-static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
- uint64_t *size);
-static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
- uint16_t buflen);
+ uint16_t buflen,
+ loff_t object_size);
struct fscache_netfs afs_cache_netfs = {
.name = "afs",
- .version = 1,
+ .version = 2,
};
struct fscache_cookie_def afs_cell_cache_index_def = {
.name = "AFS.cell",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = afs_cell_cache_get_key,
};
struct fscache_cookie_def afs_volume_cache_index_def = {
.name = "AFS.volume",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = afs_volume_cache_get_key,
};
struct fscache_cookie_def afs_vnode_cache_index_def = {
- .name = "AFS.vnode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = afs_vnode_cache_get_key,
- .get_attr = afs_vnode_cache_get_attr,
- .get_aux = afs_vnode_cache_get_aux,
- .check_aux = afs_vnode_cache_check_aux,
+ .name = "AFS.vnode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .check_aux = afs_vnode_cache_check_aux,
};
/*
- * set the key for the index entry
- */
-static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_cell *cell = cookie_netfs_data;
- uint16_t klen;
-
- _enter("%p,%p,%u", cell, buffer, bufmax);
-
- klen = strlen(cell->name);
- if (klen > bufmax)
- return 0;
-
- memcpy(buffer, cell->name, klen);
- return klen;
-}
-
-/*****************************************************************************/
-/*
- * set the key for the volume index entry
- */
-static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_volume *volume = cookie_netfs_data;
- struct {
- u64 volid;
- } __packed key;
-
- _enter("{%u},%p,%u", volume->type, buffer, bufmax);
-
- if (bufmax < sizeof(key))
- return 0;
-
- key.volid = volume->vid;
- memcpy(buffer, &key, sizeof(key));
- return sizeof(key);
-}
-
-/*****************************************************************************/
-/*
- * set the key for the index entry
- */
-static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_vnode *vnode = cookie_netfs_data;
- struct {
- u32 vnode_id[3];
- } __packed key;
-
- _enter("{%x,%x,%llx},%p,%u",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
- buffer, bufmax);
-
- /* Allow for a 96-bit key */
- memset(&key, 0, sizeof(key));
- key.vnode_id[0] = vnode->fid.vnode;
- key.vnode_id[1] = 0;
- key.vnode_id[2] = 0;
-
- if (sizeof(key) > bufmax)
- return 0;
-
- memcpy(buffer, &key, sizeof(key));
- return sizeof(key);
-}
-
-/*
- * provide updated file attributes
- */
-static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct afs_vnode *vnode = cookie_netfs_data;
-
- _enter("{%x,%x,%llx},",
- vnode->fid.vnode, vnode->fid.unique,
- vnode->status.data_version);
-
- *size = vnode->status.size;
-}
-
-struct afs_vnode_cache_aux {
- u64 data_version;
- u32 fid_unique;
-} __packed;
-
-/*
- * provide new auxiliary cache data
- */
-static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_vnode *vnode = cookie_netfs_data;
- struct afs_vnode_cache_aux aux;
-
- _enter("{%x,%x,%Lx},%p,%u",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
- buffer, bufmax);
-
- memset(&aux, 0, sizeof(aux));
- aux.data_version = vnode->status.data_version;
- aux.fid_unique = vnode->fid.unique;
-
- if (bufmax < sizeof(aux))
- return 0;
-
- memcpy(buffer, &aux, sizeof(aux));
- return sizeof(aux);
-}
-
-/*
* check that the auxiliary data indicates that the entry is still valid
*/
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
- uint16_t buflen)
+ uint16_t buflen,
+ loff_t object_size)
{
struct afs_vnode *vnode = cookie_netfs_data;
struct afs_vnode_cache_aux aux;
@@ -189,12 +61,6 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OBSOLETE;
}
- if (vnode->fid.unique != aux.fid_unique) {
- _leave(" = OBSOLETE [uniq %x != %x]",
- aux.fid_unique, vnode->fid.unique);
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
if (vnode->status.data_version != aux.data_version) {
_leave(" = OBSOLETE [vers %llx != %llx]",
aux.data_version, vnode->status.data_version);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 9bb921d120d0..4235a05afc76 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -25,7 +25,7 @@ static void afs_manage_cell(struct work_struct *);
static void afs_dec_cells_outstanding(struct afs_net *net)
{
if (atomic_dec_and_test(&net->cells_outstanding))
- wake_up_atomic_t(&net->cells_outstanding);
+ wake_up_var(&net->cells_outstanding);
}
/*
@@ -522,7 +522,9 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
#ifdef CONFIG_AFS_FSCACHE
cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
&afs_cell_cache_index_def,
- cell, true);
+ cell->name, strlen(cell->name),
+ NULL, 0,
+ cell, 0, true);
#endif
ret = afs_proc_cell_setup(net, cell);
if (ret < 0)
@@ -547,7 +549,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
spin_unlock(&net->proc_cells_lock);
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(cell->cache, 0);
+ fscache_relinquish_cookie(cell->cache, NULL, false);
cell->cache = NULL;
#endif
@@ -764,7 +766,7 @@ void afs_cell_purge(struct afs_net *net)
afs_queue_cell_manager(net);
_debug("wait");
- wait_on_atomic_t(&net->cells_outstanding, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&net->cells_outstanding,
+ !atomic_read(&net->cells_outstanding));
_leave("");
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a39192ced99e..79e665a35fea 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -339,7 +339,8 @@ int afs_page_filler(void *data, struct page *page)
/* send the page to the cache */
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+ fscache_write_page(vnode->cache, page, vnode->status.size,
+ GFP_KERNEL) != 0) {
fscache_uncache_page(vnode->cache, page);
BUG_ON(PageFsCache(page));
}
@@ -403,7 +404,8 @@ static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
/* send the page to the cache */
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+ fscache_write_page(vnode->cache, page, vnode->status.size,
+ GFP_KERNEL) != 0) {
fscache_uncache_page(vnode->cache, page);
BUG_ON(PageFsCache(page));
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 6b39d0255b72..65c5b1edd338 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -243,6 +243,33 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
}
/*
+ * Get a cache cookie for an inode.
+ */
+static void afs_get_inode_cache(struct afs_vnode *vnode)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ struct {
+ u32 vnode_id;
+ u32 unique;
+ u32 vnode_id_ext[2]; /* Allow for a 96-bit key */
+ } __packed key;
+ struct afs_vnode_cache_aux aux;
+
+ key.vnode_id = vnode->fid.vnode;
+ key.unique = vnode->fid.unique;
+ key.vnode_id_ext[0] = 0;
+ key.vnode_id_ext[1] = 0;
+ aux.data_version = vnode->status.data_version;
+
+ vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+ &afs_vnode_cache_index_def,
+ &key, sizeof(key),
+ &aux, sizeof(aux),
+ vnode, vnode->status.size, true);
+#endif
+}
+
+/*
* inode retrieval
*/
struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -307,11 +334,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
/* set up caching before mapping the status, as map-status reads the
* first page of symlinks to see if they're really mountpoints */
inode->i_size = vnode->status.size;
-#ifdef CONFIG_AFS_FSCACHE
- vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
- &afs_vnode_cache_index_def,
- vnode, true);
-#endif
+ afs_get_inode_cache(vnode);
ret = afs_inode_map_status(vnode, key);
if (ret < 0)
@@ -327,7 +350,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
/* failure */
bad_inode:
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(vnode->cache, 0);
+ fscache_relinquish_cookie(vnode->cache, NULL, ret == -ENOENT);
vnode->cache = NULL;
#endif
iget_failed(inode);
@@ -343,6 +366,10 @@ void afs_zap_data(struct afs_vnode *vnode)
{
_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_invalidate(vnode->cache);
+#endif
+
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
* directory or symlink */
@@ -507,8 +534,14 @@ void afs_evict_inode(struct inode *inode)
}
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(vnode->cache, 0);
- vnode->cache = NULL;
+ {
+ struct afs_vnode_cache_aux aux;
+
+ aux.data_version = vnode->status.data_version;
+ fscache_relinquish_cookie(vnode->cache, &aux,
+ test_bit(AFS_VNODE_DELETED, &vnode->flags));
+ vnode->cache = NULL;
+ }
#endif
afs_put_permits(vnode->permit_cache);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index f38d6a561a84..a6a1d75eee41 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -118,6 +118,7 @@ struct afs_call {
bool ret_reply0; /* T if should return reply[0] on success */
bool upgrade; /* T to request service upgrade */
u16 service_id; /* Actual service ID (after upgrade) */
+ unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
__be32 tmp; /* place to extract temporary data */
@@ -558,6 +559,13 @@ struct afs_fs_cursor {
#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */
};
+/*
+ * Cache auxiliary data.
+ */
+struct afs_vnode_cache_aux {
+ u64 data_version;
+} __packed;
+
#include <trace/events/afs.h>
/*****************************************************************************/
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index e1126659f043..f7ae54b6a393 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -103,8 +103,8 @@ void afs_close_socket(struct afs_net *net)
}
_debug("outstanding %u", atomic_read(&net->nr_outstanding_calls));
- wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&net->nr_outstanding_calls,
+ !atomic_read(&net->nr_outstanding_calls));
_debug("no outstanding calls");
kernel_sock_shutdown(net->socket, SHUT_RDWR);
@@ -131,6 +131,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->type = type;
call->net = net;
+ call->debug_id = atomic_inc_return(&rxrpc_debug_id);
atomic_set(&call->usage, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
@@ -169,13 +170,14 @@ void afs_put_call(struct afs_call *call)
afs_put_server(call->net, call->cm_server);
afs_put_cb_interest(call->net, call->cbi);
kfree(call->request);
- kfree(call);
- o = atomic_dec_return(&net->nr_outstanding_calls);
trace_afs_call(call, afs_call_trace_free, 0, o,
__builtin_return_address(0));
+ kfree(call);
+
+ o = atomic_dec_return(&net->nr_outstanding_calls);
if (o == 0)
- wake_up_atomic_t(&net->nr_outstanding_calls);
+ wake_up_var(&net->nr_outstanding_calls);
}
}
@@ -378,7 +380,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
(async ?
afs_wake_up_async_call :
afs_wake_up_call_waiter),
- call->upgrade);
+ call->upgrade,
+ call->debug_id);
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
goto error_kill_call;
@@ -727,7 +730,8 @@ void afs_charge_preallocation(struct work_struct *work)
afs_wake_up_async_call,
afs_rx_attach,
(unsigned long)call,
- GFP_KERNEL) < 0)
+ GFP_KERNEL,
+ call->debug_id) < 0)
break;
call = NULL;
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 1880f1b6a9f1..a43ef77dabae 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -25,7 +25,7 @@ static void afs_inc_servers_outstanding(struct afs_net *net)
static void afs_dec_servers_outstanding(struct afs_net *net)
{
if (atomic_dec_and_test(&net->servers_outstanding))
- wake_up_atomic_t(&net->servers_outstanding);
+ wake_up_var(&net->servers_outstanding);
}
/*
@@ -521,8 +521,8 @@ void afs_purge_servers(struct afs_net *net)
afs_queue_server_manager(net);
_debug("wait");
- wait_on_atomic_t(&net->servers_outstanding, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&net->servers_outstanding,
+ !atomic_read(&net->servers_outstanding));
_leave("");
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index b517a588781f..3037bd01f617 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -225,7 +225,9 @@ void afs_activate_volume(struct afs_volume *volume)
#ifdef CONFIG_AFS_FSCACHE
volume->cache = fscache_acquire_cookie(volume->cell->cache,
&afs_volume_cache_index_def,
- volume, true);
+ &volume->vid, sizeof(volume->vid),
+ NULL, 0,
+ volume, 0, true);
#endif
write_lock(&volume->cell->proc_lock);
@@ -245,7 +247,7 @@ void afs_deactivate_volume(struct afs_volume *volume)
write_unlock(&volume->cell->proc_lock);
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(volume->cache,
+ fscache_relinquish_cookie(volume->cache, NULL,
test_bit(AFS_VOLUME_DELETED, &volume->flags));
volume->cache = NULL;
#endif
diff --git a/fs/aio.c b/fs/aio.c
index a062d75109cb..88d7927ffbc6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,9 +68,9 @@ struct aio_ring {
#define AIO_RING_PAGES 8
struct kioctx_table {
- struct rcu_head rcu;
- unsigned nr;
- struct kioctx *table[];
+ struct rcu_head rcu;
+ unsigned nr;
+ struct kioctx __rcu *table[];
};
struct kioctx_cpu {
@@ -115,7 +115,7 @@ struct kioctx {
struct page **ring_pages;
long nr_pages;
- struct work_struct free_work;
+ struct rcu_work free_rwork; /* see free_ioctx() */
/*
* signals when all in-flight requests are done
@@ -329,7 +329,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
for (i = 0; i < table->nr; i++) {
struct kioctx *ctx;
- ctx = table->table[i];
+ ctx = rcu_dereference(table->table[i]);
if (ctx && ctx->aio_ring_file == file) {
if (!atomic_read(&ctx->dead)) {
ctx->user_id = ctx->mmap_base = vma->vm_start;
@@ -588,10 +588,15 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
return cancel(&kiocb->common);
}
+/*
+ * free_ioctx() should be RCU delayed to synchronize against the RCU
+ * protected lookup_ioctx() and also needs process context to call
+ * aio_free_ring(). Use rcu_work.
+ */
static void free_ioctx(struct work_struct *work)
{
- struct kioctx *ctx = container_of(work, struct kioctx, free_work);
-
+ struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
+ free_rwork);
pr_debug("freeing %p\n", ctx);
aio_free_ring(ctx);
@@ -609,8 +614,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
complete(&ctx->rq_wait->comp);
- INIT_WORK(&ctx->free_work, free_ioctx);
- schedule_work(&ctx->free_work);
+ /* Synchronize against RCU protected table->table[] dereferences */
+ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
+ queue_rcu_work(system_wq, &ctx->free_rwork);
}
/*
@@ -651,9 +657,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
while (1) {
if (table)
for (i = 0; i < table->nr; i++)
- if (!table->table[i]) {
+ if (!rcu_access_pointer(table->table[i])) {
ctx->id = i;
- table->table[i] = ctx;
+ rcu_assign_pointer(table->table[i], ctx);
spin_unlock(&mm->ioctx_lock);
/* While kioctx setup is in progress,
@@ -834,11 +840,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
}
table = rcu_dereference_raw(mm->ioctx_table);
- WARN_ON(ctx != table->table[ctx->id]);
- table->table[ctx->id] = NULL;
+ WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
+ RCU_INIT_POINTER(table->table[ctx->id], NULL);
spin_unlock(&mm->ioctx_lock);
- /* percpu_ref_kill() will do the necessary call_rcu() */
+ /* free_ioctx_reqs() will do the necessary RCU synchronization */
wake_up_all(&ctx->wait);
/*
@@ -880,7 +886,8 @@ void exit_aio(struct mm_struct *mm)
skipped = 0;
for (i = 0; i < table->nr; ++i) {
- struct kioctx *ctx = table->table[i];
+ struct kioctx *ctx =
+ rcu_dereference_protected(table->table[i], true);
if (!ctx) {
skipped++;
@@ -1069,7 +1076,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
if (!table || id >= table->nr)
goto out;
- ctx = table->table[id];
+ ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
percpu_ref_get(&ctx->users);
ret = ctx;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index b7c816f39404..26f6b4f41ce6 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -310,7 +310,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- return sys_close(param->ioctlfd);
+ return ksys_close(param->ioctlfd);
}
/*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a7c5a9861bef..a41b48f82a70 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -241,7 +241,7 @@ ret:
return retval;
error:
if (fd_binary > 0)
- sys_close(fd_binary);
+ ksys_close(fd_binary);
bprm->interp_flags = 0;
bprm->interp_data = 0;
goto ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fcb5175..7a506c55a993 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1058,6 +1058,27 @@ retry:
return 0;
}
+static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
+{
+ struct gendisk *disk = get_gendisk(bdev->bd_dev, partno);
+
+ if (!disk)
+ return NULL;
+ /*
+ * Now that we hold gendisk reference we make sure bdev we looked up is
+ * not stale. If it is, it means device got removed and created before
+ * we looked up gendisk and we fail open in such case. Associating
+ * unhashed bdev with newly created gendisk could lead to two bdevs
+ * (and thus two independent caches) being associated with one device
+ * which is bad.
+ */
+ if (inode_unhashed(bdev->bd_inode)) {
+ put_disk_and_module(disk);
+ return NULL;
+ }
+ return disk;
+}
+
/**
* bd_start_claiming - start claiming a block device
* @bdev: block device of interest
@@ -1094,7 +1115,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
* @bdev might not have been initialized properly yet, look up
* and grab the outer block device the hard way.
*/
- disk = get_gendisk(bdev->bd_dev, &partno);
+ disk = bdev_get_gendisk(bdev, &partno);
if (!disk)
return ERR_PTR(-ENXIO);
@@ -1111,8 +1132,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
else
whole = bdgrab(bdev);
- module_put(disk->fops->owner);
- put_disk(disk);
+ put_disk_and_module(disk);
if (!whole)
return ERR_PTR(-ENOMEM);
@@ -1304,7 +1324,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
* @bdev: struct bdev to adjust.
*
* This routine checks to see if the bdev size does not match the disk size
- * and adjusts it if it differs.
+ * and adjusts it if it differs. When shrinking the bdev size, all its caches
+ * are freed.
*/
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
{
@@ -1317,7 +1338,8 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
"%s: detected capacity change from %lld to %lld\n",
disk->disk_name, bdev_size, disk_size);
i_size_write(bdev->bd_inode, disk_size);
- flush_disk(bdev, false);
+ if (bdev_size > disk_size)
+ flush_disk(bdev, false);
}
}
EXPORT_SYMBOL(check_disk_size_change);
@@ -1407,10 +1429,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
- struct module *owner;
int ret;
int partno;
int perm = 0;
+ bool first_open = false;
if (mode & FMODE_READ)
perm |= MAY_READ;
@@ -1430,14 +1452,14 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
restart:
ret = -ENXIO;
- disk = get_gendisk(bdev->bd_dev, &partno);
+ disk = bdev_get_gendisk(bdev, &partno);
if (!disk)
goto out;
- owner = disk->fops->owner;
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
+ first_open = true;
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
@@ -1463,8 +1485,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
- put_disk(disk);
- module_put(owner);
+ put_disk_and_module(disk);
goto restart;
}
}
@@ -1524,15 +1545,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_unlock_bdev;
}
- /* only one opener holds refs to the module and disk */
- put_disk(disk);
- module_put(owner);
}
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
+ /* only one opener holds refs to the module and disk */
+ if (!first_open)
+ put_disk_and_module(disk);
return 0;
out_clear:
@@ -1546,8 +1567,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
- put_disk(disk);
- module_put(owner);
+ put_disk_and_module(disk);
out:
bdput(bdev);
@@ -1770,8 +1790,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
disk->fops->release(disk, mode);
}
if (!bdev->bd_openers) {
- struct module *owner = disk->fops->owner;
-
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
@@ -1779,8 +1797,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
- put_disk(disk);
- module_put(owner);
+ put_disk_and_module(disk);
}
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 273351ee4c46..167e5dc7eadd 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,7 +1,6 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
- select CRYPTO
- select CRYPTO_CRC32C
+ select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4373628eb4..ca693dd554e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o
+ uuid-tree.o props.o free-space-tree.o tree-checker.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1ba49ebe67da..0066d95b133f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -46,12 +46,12 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
BUG();
}
- size = __btrfs_getxattr(inode, name, "", 0);
+ size = btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
- size = __btrfs_getxattr(inode, name, value, size);
+ size = btrfs_getxattr(inode, name, value, size);
}
if (size > 0) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
@@ -65,9 +65,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
return acl;
}
-/*
- * Needs to be called with fs_mutex held
- */
static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct posix_acl *acl, int type)
{
@@ -101,7 +98,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
goto out;
}
- ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+ ret = btrfs_setxattr(trans, inode, name, value, size, 0);
out:
kfree(value);
@@ -127,11 +124,6 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return ret;
}
-/*
- * btrfs_init_acl is already generally called under fs_mutex, so the locking
- * stuff has been fixed to work with that. If the locking stuff changes, we
- * need to re-evaluate the acl locking stuff.
- */
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
{
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f94b2d8c744a..571024bc632e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -170,7 +170,7 @@ int __init btrfs_prelim_ref_init(void)
return 0;
}
-void btrfs_prelim_ref_exit(void)
+void __cold btrfs_prelim_ref_exit(void)
{
kmem_cache_destroy(btrfs_prelim_ref_cache);
}
@@ -738,7 +738,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
+ ref->level - 1, NULL);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
@@ -773,15 +774,12 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
struct btrfs_key tmp_op_key;
- struct btrfs_key *op_key = NULL;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key) {
+ if (extent_op && extent_op->update_key)
btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
- op_key = &tmp_op_key;
- }
spin_lock(&head->lock);
for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
@@ -1291,7 +1289,8 @@ again:
ref->level == 0) {
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref->parent, 0);
+ eb = read_tree_block(fs_info, ref->parent, 0,
+ ref->level, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -1519,6 +1518,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
if (!node)
break;
bytenr = node->val;
+ shared.share_count = 0;
cond_resched();
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0c2fab8514ff..0a30028d5196 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -73,7 +73,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
int __init btrfs_prelim_ref_init(void);
-void btrfs_prelim_ref_exit(void);
+void __cold btrfs_prelim_ref_exit(void);
struct prelim_ref {
struct rb_node rbnode;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 63f0ccc92a71..ca15be569d69 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -195,7 +195,6 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
- long delayed_iput_count;
/*
* To avoid races between lockless (i_mutex not held) direct IO writes
@@ -365,6 +364,4 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
logical_start, csum, csum_expected, mirror_num);
}
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
-
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d51b5a5b505..3baebbc021c5 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -96,9 +96,9 @@
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
+#include <linux/crc32c.h>
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "extent_io.h"
#include "volumes.h"
@@ -1736,7 +1736,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
size_t sublen = i ? PAGE_SIZE :
(PAGE_SIZE - BTRFS_CSUM_SIZE);
- crc = btrfs_crc32c(crc, data, sublen);
+ crc = crc32c(crc, data, sublen);
}
btrfs_csum_final(crc, csum);
if (memcmp(csum, h->csum, state->csum_size))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 07d049c0c20f..562c3e633403 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1133,7 +1133,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
return ret;
}
-void btrfs_exit_compress(void)
+void __cold btrfs_exit_compress(void)
{
free_workspaces();
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 677fa4aa0bd7..ce796557a918 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -76,7 +76,7 @@ struct compressed_bio {
};
void __init btrfs_init_compress(void);
-void btrfs_exit_compress(void);
+void __cold btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b88a79e69ddf..a2c9d21176e2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -41,8 +41,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot);
-static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -301,11 +299,6 @@ enum mod_log_op {
MOD_LOG_ROOT_REPLACE,
};
-struct tree_mod_move {
- int dst_slot;
- int nr_items;
-};
-
struct tree_mod_root {
u64 logical;
u8 level;
@@ -328,32 +321,15 @@ struct tree_mod_elem {
u64 blockptr;
/* this is used for op == MOD_LOG_MOVE_KEYS */
- struct tree_mod_move move;
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
/* this is used for op == MOD_LOG_ROOT_REPLACE */
struct tree_mod_root old_root;
};
-static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
-{
- read_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
-{
- read_unlock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
-{
- write_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
-{
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
/*
* Pull a new tree mod seq number for our operation.
*/
@@ -373,14 +349,14 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
spin_unlock(&fs_info->tree_mod_seq_lock);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
}
@@ -422,7 +398,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
@@ -432,7 +408,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
rb_erase(node, tm_root);
kfree(tm);
}
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
}
/*
@@ -443,7 +419,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* for root replace operations, or the logical address of the affected
* block for all other operations.
*
- * Note: must be called with write lock (tree_mod_log_write_lock).
+ * Note: must be called with write lock for fs_info::tree_mod_log_lock.
*/
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -481,7 +457,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
* Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
* returns zero with the tree_mod_log_lock acquired. The caller must hold
* this until all tree mod log insertions are recorded in the rb tree and then
- * call tree_mod_log_write_unlock() to release.
+ * write unlock fs_info::tree_mod_log_lock.
*/
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb) {
@@ -491,9 +467,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
if (eb && btrfs_header_level(eb) == 0)
return 1;
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
return 1;
}
@@ -536,38 +512,34 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
return tm;
}
-static noinline int
-tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
+static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
{
struct tree_mod_elem *tm;
int ret;
- if (!tree_mod_need_log(fs_info, eb))
+ if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm = alloc_tree_mod_elem(eb, slot, op, flags);
if (!tm)
return -ENOMEM;
- if (tree_mod_dont_log(fs_info, eb)) {
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
kfree(tm);
return 0;
}
- ret = __tree_mod_log_insert(fs_info, tm);
- tree_mod_log_write_unlock(fs_info);
+ ret = __tree_mod_log_insert(eb->fs_info, tm);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
kfree(tm);
return ret;
}
-static noinline int
-tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int dst_slot, int src_slot,
- int nr_items)
+static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot, int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
@@ -575,7 +547,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
int i;
int locked = 0;
- if (!tree_mod_need_log(fs_info, eb))
+ if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
@@ -603,7 +575,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
}
}
- if (tree_mod_dont_log(fs_info, eb))
+ if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
locked = 1;
@@ -613,26 +585,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
* buffer, i.e. dst_slot < src_slot.
*/
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
if (ret)
goto free_tms;
}
- ret = __tree_mod_log_insert(fs_info, tm);
+ ret = __tree_mod_log_insert(eb->fs_info, tm);
if (ret)
goto free_tms;
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nr_items; i++) {
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
kfree(tm_list[i]);
}
if (locked)
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
kfree(tm);
@@ -660,12 +632,10 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
return 0;
}
-static noinline int
-tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
- struct extent_buffer *old_root,
- struct extent_buffer *new_root,
- int log_removal)
+static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root, int log_removal)
{
+ struct btrfs_fs_info *fs_info = old_root->fs_info;
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
@@ -713,7 +683,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (!ret)
ret = __tree_mod_log_insert(fs_info, tm);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
@@ -740,7 +710,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
struct tree_mod_elem *cur = NULL;
struct tree_mod_elem *found = NULL;
- tree_mod_log_read_lock(fs_info);
+ read_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
@@ -768,7 +738,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
break;
}
}
- tree_mod_log_read_unlock(fs_info);
+ read_unlock(&fs_info->tree_mod_log_lock);
return found;
}
@@ -849,7 +819,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
goto free_tms;
}
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
@@ -861,36 +831,13 @@ free_tms:
kfree(tm_list[i]);
}
if (locked)
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return ret;
}
-static inline void
-tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
- int dst_offset, int src_offset, int nr_items)
-{
- int ret;
- ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
- nr_items);
- BUG_ON(ret < 0);
-}
-
-static noinline void
-tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot, int atomic)
-{
- int ret;
-
- ret = tree_mod_log_insert_key(fs_info, eb, slot,
- MOD_LOG_KEY_REPLACE,
- atomic ? GFP_ATOMIC : GFP_NOFS);
- BUG_ON(ret < 0);
-}
-
-static noinline int
-tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
@@ -900,7 +847,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
if (btrfs_header_level(eb) == 0)
return 0;
- if (!tree_mod_need_log(fs_info, NULL))
+ if (!tree_mod_need_log(eb->fs_info, NULL))
return 0;
nritems = btrfs_header_nritems(eb);
@@ -917,11 +864,11 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
}
}
- if (tree_mod_dont_log(fs_info, eb))
+ if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
- ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- tree_mod_log_write_unlock(fs_info);
+ ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
@@ -936,17 +883,6 @@ free_tms:
return ret;
}
-static noinline void
-tree_mod_log_set_root_pointer(struct btrfs_root *root,
- struct extent_buffer *new_root_node,
- int log_removal)
-{
- int ret;
- ret = tree_mod_log_insert_root(root->fs_info, root->node,
- new_root_node, log_removal);
- BUG_ON(ret < 0);
-}
-
/*
* check if the tree block can be shared by multiple trees
*/
@@ -1173,7 +1109,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
extent_buffer_get(cow);
- tree_mod_log_set_root_pointer(root, cow, 1);
+ ret = tree_mod_log_insert_root(root->node, cow, 1);
+ BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -1182,7 +1119,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(fs_info, parent, parent_slot,
+ tree_mod_log_insert_key(parent, parent_slot,
MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
@@ -1190,7 +1127,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(fs_info, buf);
+ ret = tree_mod_log_free_eb(buf);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1211,9 +1148,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
* returns the logical address of the oldest predecessor of the given root.
* entries older than time_seq are ignored.
*/
-static struct tree_mod_elem *
-__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb_root, u64 time_seq)
+static struct tree_mod_elem *__tree_mod_log_oldest_root(
+ struct extent_buffer *eb_root, u64 time_seq)
{
struct tree_mod_elem *tm;
struct tree_mod_elem *found = NULL;
@@ -1230,7 +1166,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
* first operation that's logged for this root.
*/
while (1) {
- tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
time_seq);
if (!looped && !tm)
return NULL;
@@ -1279,7 +1215,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
- tree_mod_log_read_lock(fs_info);
+ read_lock(&fs_info->tree_mod_log_lock);
while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
@@ -1334,7 +1270,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
if (tm->logical != first_tm->logical)
break;
}
- tree_mod_log_read_unlock(fs_info);
+ read_unlock(&fs_info->tree_mod_log_lock);
btrfs_set_header_nritems(eb, n);
}
@@ -1418,9 +1354,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
+ int level;
eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
+ tm = __tree_mod_log_oldest_root(eb_root, time_seq);
if (!tm)
return eb_root;
@@ -1428,15 +1365,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
old_root = &tm->old_root;
old_generation = tm->generation;
logical = old_root->logical;
+ level = old_root->level;
} else {
logical = eb_root->start;
+ level = btrfs_header_level(eb_root);
}
tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, 0);
+ old = read_tree_block(fs_info, logical, 0, level, NULL);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
@@ -1484,7 +1423,7 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
int level;
struct extent_buffer *eb_root = btrfs_root_node(root);
- tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ tm = __tree_mod_log_oldest_root(eb_root, time_seq);
if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
level = tm->old_root.level;
} else {
@@ -1502,8 +1441,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
if (btrfs_is_testing(root->fs_info))
return 0;
- /* ensure we can see the force_cow */
- smp_rmb();
+ /* Ensure we can see the FORCE_COW bit */
+ smp_mb__before_atomic();
/*
* We do not need to cow a block if
@@ -1656,6 +1595,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(parent);
for (i = start_slot; i <= end_slot; i++) {
+ struct btrfs_key first_key;
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1665,6 +1605,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
gen = btrfs_node_ptr_generation(parent, i);
+ btrfs_node_key_to_cpu(parent, &first_key, i);
if (last_block == 0)
last_block = blocknr;
@@ -1688,7 +1629,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(fs_info, blocknr, gen);
+ cur = read_tree_block(fs_info, blocknr, gen,
+ parent_level - 1,
+ &first_key);
if (IS_ERR(cur)) {
return PTR_ERR(cur);
} else if (!extent_buffer_uptodate(cur)) {
@@ -1696,7 +1639,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
return -EIO;
}
} else if (!uptodate) {
- err = btrfs_read_buffer(cur, gen);
+ err = btrfs_read_buffer(cur, gen,
+ parent_level - 1,&first_key);
if (err) {
free_extent_buffer(cur);
return err;
@@ -1849,14 +1793,17 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
{
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
+ struct btrfs_key first_key;
if (slot < 0 || slot >= btrfs_header_nritems(parent))
return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
- btrfs_node_ptr_generation(parent, slot));
+ btrfs_node_ptr_generation(parent, slot),
+ level - 1, &first_key);
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
eb = ERR_PTR(-EIO);
@@ -1928,7 +1875,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- tree_mod_log_set_root_pointer(root, child, 1);
+ ret = tree_mod_log_insert_root(root->node, child, 1);
+ BUG_ON(ret < 0);
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -2007,8 +1955,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- tree_mod_log_set_node_key(fs_info, parent,
- pslot + 1, 0);
+ ret = tree_mod_log_insert_key(parent, pslot + 1,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -2052,7 +2001,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
+ ret = tree_mod_log_insert_key(parent, pslot,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -2153,7 +2104,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
+ ret = tree_mod_log_insert_key(parent, pslot,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -2207,8 +2160,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- tree_mod_log_set_node_key(fs_info, parent,
- pslot + 1, 0);
+ ret = tree_mod_log_insert_key(parent, pslot + 1,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2445,10 +2399,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
u64 gen;
struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
+ struct btrfs_key first_key;
int ret;
+ int parent_level;
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
+ parent_level = btrfs_header_level(b);
+ btrfs_node_key_to_cpu(b, &first_key, slot);
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
@@ -2467,7 +2425,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_set_path_blocking(p);
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen);
+ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
if (!ret) {
*eb_ret = tmp;
return 0;
@@ -2494,7 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, 0);
+ tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+ &first_key);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
@@ -3161,13 +3120,17 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info,
{
int i;
struct extent_buffer *t;
+ int ret;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
int tslot = path->slots[i];
+
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(fs_info, t, tslot, 1);
+ ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
+ GFP_ATOMIC);
+ BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -3264,8 +3227,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * don't call tree_mod_log_eb_move here, key removal was already
- * fully logged by tree_mod_log_eb_copy above.
+ * Don't call tree_mod_log_insert_move here, key removal was
+ * already fully logged by tree_mod_log_eb_copy above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
@@ -3320,7 +3283,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
- tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
+ ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
@@ -3363,6 +3327,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct extent_buffer *c;
struct extent_buffer *old;
struct btrfs_disk_key lower_key;
+ int ret;
BUG_ON(path->nodes[level]);
BUG_ON(path->nodes[level-1] != root->node);
@@ -3401,7 +3366,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- tree_mod_log_set_root_pointer(root, c, 0);
+ ret = tree_mod_log_insert_root(root->node, c, 0);
+ BUG_ON(ret < 0);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -3438,17 +3404,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
if (slot != nritems) {
- if (level)
- tree_mod_log_eb_move(fs_info, lower, slot + 1,
- slot, nritems - slot);
+ if (level) {
+ ret = tree_mod_log_insert_move(lower, slot + 1, slot,
+ nritems - slot);
+ BUG_ON(ret < 0);
+ }
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(fs_info, lower, slot,
- MOD_LOG_KEY_ADD, GFP_NOFS);
+ ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
+ GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -4911,17 +4879,19 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
- if (level)
- tree_mod_log_eb_move(fs_info, parent, slot,
- slot + 1, nritems - slot - 1);
+ if (level) {
+ ret = tree_mod_log_insert_move(parent, slot, slot + 1,
+ nritems - slot - 1);
+ BUG_ON(ret < 0);
+ }
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(fs_info, parent, slot,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
+ GFP_NOFS);
BUG_ON(ret < 0);
}
@@ -5145,9 +5115,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
- * This does lock as it descends, and path->keep_locks should be set
- * to 1 by the caller.
- *
* This honors path->lowest_level to prevent descent past a given level
* of the tree.
*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a462ab85c49..0eb55825862a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,6 +40,7 @@
#include <linux/sizes.h>
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
+#include <linux/crc32c.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -65,6 +66,8 @@ struct btrfs_ordered_sum;
#define BTRFS_MAX_LEVEL 8
+#define BTRFS_OLDEST_GENERATION 0ULL
+
#define BTRFS_COMPAT_EXTENT_TREE_V0
/*
@@ -86,9 +89,9 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_LINK_MAX 65535U
+/* four bytes for CRC32 */
static const int btrfs_csum_sizes[] = { 4 };
-/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
/* ioprio of readahead is set to idle */
@@ -98,6 +101,7 @@ static const int btrfs_csum_sizes[] = { 4 };
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+
/*
* Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
*/
@@ -381,8 +385,9 @@ struct btrfs_dev_replace {
/* For raid type sysfs entries */
struct raid_kobject {
- int raid_type;
+ u64 flags;
struct kobject kobj;
+ struct list_head list;
};
struct btrfs_space_info {
@@ -707,7 +712,6 @@ struct btrfs_delayed_root;
#define BTRFS_FS_LOG_RECOVERING 4
#define BTRFS_FS_OPEN 5
#define BTRFS_FS_QUOTA_ENABLED 6
-#define BTRFS_FS_QUOTA_ENABLING 7
#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9
#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10
#define BTRFS_FS_BTREE_ERR 11
@@ -788,7 +792,7 @@ struct btrfs_fs_info {
unsigned long pending_changes;
unsigned long compress_type:4;
unsigned int compress_level;
- int commit_interval;
+ u32 commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
* wrong number because we will write out the data into a regular
@@ -877,7 +881,6 @@ struct btrfs_fs_info {
struct rb_root tree_mod_log;
atomic_t async_delalloc_pages;
- atomic_t open_ioctl_trans;
/*
* this is used to protect the following list -- ordered_roots.
@@ -935,9 +938,11 @@ struct btrfs_fs_info {
struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
- int thread_pool_size;
+ u32 thread_pool_size;
struct kobject *space_info_kobj;
+ struct list_head pending_raid_kobjs;
+ spinlock_t pending_raid_kobjs_lock; /* uncontended */
u64 total_pinned;
@@ -952,9 +957,9 @@ struct btrfs_fs_info {
struct btrfs_fs_devices *fs_devices;
/*
- * the space_info list is almost entirely read only. It only changes
- * when we add a new raid type to the FS, and that happens
- * very rarely. RCU is used to protect it.
+ * The space_info list is effectively read only after initial
+ * setup. It is populated at mount time and cleaned up after
+ * all block groups are removed. RCU is used to protect it.
*/
struct list_head space_info;
@@ -993,8 +998,8 @@ struct btrfs_fs_info {
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
- unsigned data_chunk_allocations;
- unsigned metadata_ratio;
+ u32 data_chunk_allocations;
+ u32 metadata_ratio;
void *bdev_holder;
@@ -1260,12 +1265,13 @@ struct btrfs_root {
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
- /* For qgroup metadata space reserve */
- atomic64_t qgroup_meta_rsv;
+ /* For qgroup metadata reserved space */
+ spinlock_t qgroup_meta_rsv_lock;
+ u64 qgroup_meta_rsv_pertrans;
+ u64 qgroup_meta_rsv_prealloc;
};
struct btrfs_file_private {
- struct btrfs_trans_handle *trans;
void *filldir_buf;
};
@@ -2554,6 +2560,20 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) crc32c(parent_objectid, name, len);
+}
+
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
@@ -2608,7 +2628,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, unsigned long count);
+ unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
@@ -2628,7 +2648,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
u64 bytenr);
void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2668,15 +2687,13 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset);
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -2688,6 +2705,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
@@ -2697,8 +2715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
@@ -2730,11 +2747,10 @@ int btrfs_check_data_free_space(struct inode *inode,
void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
@@ -2745,10 +2761,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
u64 *qgroup_reserved, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free);
int btrfs_delalloc_reserve_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
@@ -2792,7 +2810,6 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
@@ -2974,7 +2991,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
security_free_mnt_opts(&fs_info->security_opts);
- kfree(fs_info);
+ kvfree(fs_info);
}
/* tree mod log functions from ctree.c */
@@ -3095,7 +3112,10 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
-int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
+ const char *name,
+ int name_len, struct btrfs_inode_ref **ref_ret);
+int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
u64 ref_objectid, const char *name,
int name_len,
struct btrfs_inode_extref **extref_ret);
@@ -3192,8 +3212,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
-void btrfs_destroy_cachep(void);
-long btrfs_ioctl_trans_end(struct file *file);
+void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
@@ -3243,7 +3262,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
/* file.c */
int __init btrfs_auto_defrag_init(void);
-void btrfs_auto_defrag_exit(void);
+void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
@@ -3278,25 +3297,23 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int __init btrfs_init_sysfs(void);
-void btrfs_exit_sysfs(void);
+void __cold btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
-/* xattr.c */
-ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-
/* super.c */
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
-static inline __printf(2, 3)
+static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
#ifdef CONFIG_PRINTK
__printf(2, 3)
+__cold
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#else
#define btrfs_printk(fs_info, fmt, args...) \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0530f6f2e4ba..86ec2edc05e8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -23,6 +23,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "ctree.h"
+#include "qgroup.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -42,7 +43,7 @@ int __init btrfs_delayed_inode_init(void)
return 0;
}
-void btrfs_delayed_inode_exit(void)
+void __cold btrfs_delayed_inode_exit(void)
{
kmem_cache_destroy(delayed_node_cache);
}
@@ -552,11 +553,12 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_bytes;
int ret;
@@ -578,15 +580,17 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return ret;
}
-static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info,
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
if (!item->bytes_reserved)
return;
rsv = &fs_info->delayed_block_rsv;
+ btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved);
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
@@ -611,6 +615,9 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret < 0)
+ return ret;
/*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
@@ -630,8 +637,10 @@ static int btrfs_delayed_inode_reserve_metadata(
* EAGAIN to make us stop the transaction we have, so return
* ENOSPC instead so that btrfs_dirty_inode knows what to do.
*/
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
ret = -ENOSPC;
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+ }
if (!ret) {
node->bytes_reserved = num_bytes;
trace_btrfs_space_reservation(fs_info,
@@ -653,7 +662,8 @@ static int btrfs_delayed_inode_reserve_metadata(
}
static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ bool qgroup_free)
{
struct btrfs_block_rsv *rsv;
@@ -665,6 +675,12 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
node->inode_id, node->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv,
node->bytes_reserved);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(node->root,
+ node->bytes_reserved);
+ else
+ btrfs_qgroup_convert_reserved_meta(node->root,
+ node->bytes_reserved);
node->bytes_reserved = 0;
}
@@ -766,7 +782,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
curr->data_len);
slot++;
- btrfs_delayed_item_release_metadata(fs_info, curr);
+ btrfs_delayed_item_release_metadata(root, curr);
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
@@ -788,7 +804,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *delayed_item)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
char *ptr;
int ret;
@@ -806,7 +821,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
delayed_item->data_len);
btrfs_mark_buffer_dirty(leaf);
- btrfs_delayed_item_release_metadata(fs_info, delayed_item);
+ btrfs_delayed_item_release_metadata(root, delayed_item);
return 0;
}
@@ -858,7 +873,6 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -908,7 +922,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
goto out;
list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(fs_info, curr);
+ btrfs_delayed_item_release_metadata(root, curr);
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
@@ -1051,7 +1065,7 @@ out:
no_iref:
btrfs_release_path(path);
err_out:
- btrfs_delayed_inode_release_metadata(fs_info, node);
+ btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
return ret;
@@ -1115,9 +1129,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
* Returns < 0 on error and returns with an aborted transaction with any
* outstanding delayed items cleaned up.
*/
-static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, int nr)
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
@@ -1162,16 +1176,14 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
{
- return __btrfs_run_delayed_items(trans, fs_info, -1);
+ return __btrfs_run_delayed_items(trans, -1);
}
-int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, int nr)
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
{
- return __btrfs_run_delayed_items(trans, fs_info, nr);
+ return __btrfs_run_delayed_items(trans, nr);
}
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
@@ -1443,7 +1455,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible
@@ -1480,7 +1492,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
return 1;
}
- btrfs_delayed_item_release_metadata(fs_info, item);
+ btrfs_delayed_item_release_metadata(node->root, item);
btrfs_release_delayed_item(item);
mutex_unlock(&node->mutex);
return 0;
@@ -1515,7 +1527,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
item->key = item_key;
- ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1880,7 +1892,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(fs_info, curr_item);
+ btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
@@ -1888,7 +1900,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(fs_info, curr_item);
+ btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
@@ -1898,7 +1910,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_iref(delayed_node);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- btrfs_delayed_inode_release_metadata(fs_info, delayed_node);
+ btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
btrfs_release_delayed_inode(delayed_node);
}
mutex_unlock(&delayed_node->mutex);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index c4189d495934..100a91e26b55 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -111,10 +111,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, int nr);
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr);
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
@@ -151,7 +149,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
/* for init */
int __init btrfs_delayed_inode_init(void);
-void btrfs_delayed_inode_exit(void);
+void __cold btrfs_delayed_inode_exit(void);
/* for debugging */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7ab5e0128f0c..2677257c149d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -216,7 +216,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- assert_spin_locked(&delayed_refs->lock);
+ lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
return 0;
@@ -239,7 +239,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
- assert_spin_locked(&head->lock);
+ lockdep_assert_held(&head->lock);
rb_erase(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
@@ -307,7 +307,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct rb_node *node;
u64 seq = 0;
- assert_spin_locked(&head->lock);
+ lockdep_assert_held(&head->lock);
if (RB_EMPTY_ROOT(&head->ref_tree))
return;
@@ -930,7 +930,7 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
-void btrfs_delayed_ref_exit(void)
+void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c4f625e5a691..9e3e5aff0937 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -204,7 +204,7 @@ extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
-void btrfs_delayed_ref_exit(void);
+void __cold btrfs_delayed_ref_exit(void);
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7efbc4d1128b..0d203633bb96 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -44,7 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
-static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@ -174,8 +173,14 @@ no_valid_dev_replace_entry_found:
}
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&dev_replace->tgtdev->dev_state);
- btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
- dev_replace->tgtdev);
+
+ WARN_ON(fs_info->fs_devices->rw_devices == 0);
+ dev_replace->tgtdev->io_width = fs_info->sectorsize;
+ dev_replace->tgtdev->io_align = fs_info->sectorsize;
+ dev_replace->tgtdev->sector_size = fs_info->sectorsize;
+ dev_replace->tgtdev->fs_info = fs_info;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &dev_replace->tgtdev->dev_state);
}
break;
}
@@ -200,13 +205,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) {
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
return 0;
}
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +269,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
@@ -287,7 +292,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_mark_buffer_dirty(eb);
@@ -307,7 +312,7 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
static char* btrfs_dev_name(struct btrfs_device *device)
{
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
return rcu_str_deref(device->name);
@@ -352,7 +357,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -390,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -402,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
goto leave;
}
@@ -426,7 +431,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -493,18 +498,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
/*
* flush all outstanding I/O and inode extent mappings before the
@@ -529,7 +534,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -549,7 +554,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
@@ -586,7 +591,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -679,7 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -691,41 +696,36 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
args->status.num_uncorrectable_read_errors =
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
}
-int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
- struct btrfs_ioctl_dev_replace_args *args)
-{
- args->result = __btrfs_dev_replace_cancel(fs_info);
- return 0;
-}
-
-static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->tree_root;
- u64 result;
+ int result;
int ret;
if (sb_rdonly(fs_info->sb))
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
goto leave;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
break;
@@ -733,7 +733,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
@@ -743,6 +743,12 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
+
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -756,7 +762,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -772,7 +778,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
@@ -782,12 +788,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
@@ -801,10 +807,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
@@ -873,37 +879,37 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
+void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
- if (rw == 1) {
- /* write */
-again:
- wait_event(dev_replace->read_lock_wq,
- atomic_read(&dev_replace->blocking_readers) == 0);
- write_lock(&dev_replace->lock);
- if (atomic_read(&dev_replace->blocking_readers)) {
- write_unlock(&dev_replace->lock);
- goto again;
- }
- } else {
- read_lock(&dev_replace->lock);
- atomic_inc(&dev_replace->read_locks);
- }
+ read_lock(&dev_replace->lock);
+ atomic_inc(&dev_replace->read_locks);
+}
+
+void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+ atomic_dec(&dev_replace->read_locks);
+ read_unlock(&dev_replace->lock);
}
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
+void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
{
- if (rw == 1) {
- /* write */
- ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+again:
+ wait_event(dev_replace->read_lock_wq,
+ atomic_read(&dev_replace->blocking_readers) == 0);
+ write_lock(&dev_replace->lock);
+ if (atomic_read(&dev_replace->blocking_readers)) {
write_unlock(&dev_replace->lock);
- } else {
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- atomic_dec(&dev_replace->read_locks);
- read_unlock(&dev_replace->lock);
+ goto again;
}
}
+void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+ write_unlock(&dev_replace->lock);
+}
+
/* inc blocking cnt and release read lock */
void btrfs_dev_replace_set_lock_blocking(
struct btrfs_dev_replace *dev_replace)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index f94a76844ae7..8566a02ef222 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -32,13 +32,14 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
- struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_clear_lock_blocking(
struct btrfs_dev_replace *dev_replace);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index cbe421605cd5..29e967b2c667 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -18,7 +18,6 @@
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 21f34ad0d411..07b5e6f7df67 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,10 +31,10 @@
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
+#include <linux/crc32c.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
@@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void)
return 0;
}
-void btrfs_end_io_wq_exit(void)
+void __cold btrfs_end_io_wq_exit(void)
{
kmem_cache_destroy(btrfs_end_io_wq_cache);
}
@@ -124,8 +124,8 @@ struct async_submit_bio {
void *private_data;
struct btrfs_fs_info *fs_info;
struct bio *bio;
- extent_submit_bio_hook_t *submit_bio_start;
- extent_submit_bio_hook_t *submit_bio_done;
+ extent_submit_bio_start_t *submit_bio_start;
+ extent_submit_bio_done_t *submit_bio_done;
int mirror_num;
unsigned long bio_flags;
/*
@@ -270,7 +270,7 @@ out:
u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
{
- return btrfs_crc32c(seed, data, len);
+ return crc32c(seed, data, len);
}
void btrfs_csum_final(u32 crc, u8 *result)
@@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
u32 crc = ~(u32)0;
- const int csum_size = sizeof(crc);
- char result[csum_size];
+ char result[sizeof(crc)];
/*
* The super_block structure does not span the whole
@@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
- if (memcmp(raw_disk_sb, result, csum_size))
+ if (memcmp(raw_disk_sb, result, sizeof(result)))
ret = 1;
}
@@ -428,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
return ret;
}
+static int verify_level_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key)
+{
+ int found_level;
+ struct btrfs_key found_key;
+ int ret;
+
+ found_level = btrfs_header_level(eb);
+ if (found_level != level) {
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_ON(1);
+ btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+ eb->start, level, found_level);
+#endif
+ return -EIO;
+ }
+
+ if (!first_key)
+ return 0;
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ ret = btrfs_comp_cpu_keys(first_key, &found_key);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret) {
+ WARN_ON(1);
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
+ eb->start, first_key->objectid, first_key->type,
+ first_key->offset, found_key.objectid,
+ found_key.type, found_key.offset);
+ }
+#endif
+ return ret;
+}
+
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
+ *
+ * @parent_transid: expected transid, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key of first slot, skip check if NULL
*/
static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
- u64 parent_transid)
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct extent_io_tree *io_tree;
int failed = 0;
@@ -449,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
mirror_num);
if (!ret) {
- if (!verify_parent_transid(io_tree, eb,
+ if (verify_parent_transid(io_tree, eb,
parent_transid, 0))
- break;
- else
ret = -EIO;
+ else if (verify_level_key(fs_info, eb, level,
+ first_key))
+ ret = -EUCLEAN;
+ else
+ break;
}
/*
@@ -461,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
* there is no reason to read the other copies, they won't be
* any less wrong.
*/
- if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
+ ret == -EUCLEAN)
break;
num_copies = btrfs_num_copies(fs_info,
@@ -602,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
- if (found_level > 0 && btrfs_check_node(root, eb))
+ if (found_level > 0 && btrfs_check_node(fs_info, eb))
ret = -EIO;
if (!ret)
@@ -710,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
return 0;
}
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
-{
- unsigned long limit = min_t(unsigned long,
- info->thread_pool_size,
- info->fs_devices->open_devices);
- return 256 * limit;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -725,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work)
async = container_of(work, struct async_submit_bio, work);
ret = async->submit_bio_start(async->private_data, async->bio,
- async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
async->status = ret;
@@ -744,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
- async->bio_flags, async->bio_offset);
+ async->submit_bio_done(async->private_data, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -759,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work)
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset, void *private_data,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done)
+ extent_submit_bio_start_t *submit_bio_start,
+ extent_submit_bio_done_t *submit_bio_done)
{
struct async_submit_bio *async;
@@ -807,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
+static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
/*
@@ -818,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio
return btree_csum_one_bio(bio);
}
-static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num)
{
struct inode *inode = private_data;
blk_status_t ret;
@@ -879,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
*/
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
bio_offset, private_data,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ btree_submit_bio_start,
+ btree_submit_bio_done);
}
if (ret)
@@ -1062,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
buf->start, buf->start + buf->len - 1);
}
+/*
+ * Read tree block at logical address @bytenr and do various basic but critical
+ * verification.
+ *
+ * @parent_transid: expected transid of this tree block, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key in slot 0, skip check if NULL
+ */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid)
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
@@ -1072,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+ ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ level, first_key);
if (ret) {
free_extent_buffer(buf);
return ERR_PTR(ret);
@@ -1108,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1160,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
+ spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
@@ -1176,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->orphan_inodes, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
- atomic64_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1401,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_path *path;
u64 generation;
int ret;
+ int level;
path = btrfs_alloc_path();
if (!path)
@@ -1423,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
+ level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation);
+ generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
goto find_fail;
@@ -1808,12 +1857,10 @@ sleep:
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
&fs_info->fs_state)))
btrfs_cleanup_transaction(fs_info);
- set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
cannot_commit))
- schedule_timeout(delay);
- __set_current_state(TASK_RUNNING);
+ schedule_timeout_interruptible(delay);
} while (!kthread_should_stop());
return 0;
}
@@ -2183,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
- int max_active = fs_info->thread_pool_size;
+ u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
@@ -2276,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
+ int level = btrfs_super_log_root_level(disk_super);
if (fs_devices->rw_devices == 0) {
btrfs_warn(fs_info, "log replay required on RO media");
@@ -2289,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1);
+ fs_info->generation + 1,
+ level, NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2334,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.offset = 0;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
btrfs_init_devices_late(fs_info);
location.objectid = BTRFS_CSUM_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->csum_root = root;
@@ -2367,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (IS_ERR(root)) {
ret = PTR_ERR(root);
if (ret != -ENOENT)
- return ret;
+ goto out;
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
@@ -2376,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->free_space_root = root;
}
return 0;
+out:
+ btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
+ location.objectid, ret);
+ return ret;
}
int open_ctree(struct super_block *sb,
@@ -2404,8 +2465,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
- int max_active;
int clear_free_space_tree = 0;
+ int level;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2447,6 +2508,8 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
+ spin_lock_init(&fs_info->pending_raid_kobjs_lock);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2713,8 +2776,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- max_active = fs_info->thread_pool_size;
-
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -2741,12 +2802,13 @@ int open_ctree(struct super_block *sb,
}
generation = btrfs_super_chunk_root_generation(disk_super);
+ level = btrfs_super_chunk_root_level(disk_super);
__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
- generation);
+ generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
btrfs_err(fs_info, "failed to read chunk root");
@@ -2768,10 +2830,10 @@ int open_ctree(struct super_block *sb,
}
/*
- * keep the device that is marked to be the target device for the
- * dev_replace procedure
+ * Keep the devid that is marked to be the target device for the
+ * device replace procedure
*/
- btrfs_close_extra_devices(fs_devices, 0);
+ btrfs_free_extra_devids(fs_devices, 0);
if (!fs_devices->latest_bdev) {
btrfs_err(fs_info, "failed to read devices");
@@ -2780,10 +2842,11 @@ int open_ctree(struct super_block *sb,
retry_root_backup:
generation = btrfs_super_generation(disk_super);
+ level = btrfs_super_root_level(disk_super);
tree_root->node = read_tree_block(fs_info,
btrfs_super_root(disk_super),
- generation);
+ generation, level, NULL);
if (IS_ERR(tree_root->node) ||
!extent_buffer_uptodate(tree_root->node)) {
btrfs_warn(fs_info, "failed to read tree root");
@@ -2834,7 +2897,7 @@ retry_root_backup:
goto fail_block_groups;
}
- btrfs_close_extra_devices(fs_devices, 1);
+ btrfs_free_extra_devids(fs_devices, 1);
ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
if (ret) {
@@ -2953,6 +3016,7 @@ retry_root_backup:
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
+ btrfs_warn(fs_info, "failed to read fs tree: %d", err);
goto fail_qgroup;
}
@@ -3290,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
struct buffer_head *bh;
int i;
int errors = 0;
+ bool primary_failed = false;
u64 bytenr;
if (max_mirrors == 0)
@@ -3306,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
errors++;
+ if (i == 0)
+ primary_failed = true;
continue;
}
wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
+ if (!buffer_uptodate(bh)) {
errors++;
+ if (i == 0)
+ primary_failed = true;
+ }
/* drop our reference */
brelse(bh);
@@ -3319,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
brelse(bh);
}
+ /* log error, force error return */
+ if (primary_failed) {
+ btrfs_err(device->fs_info, "error writing primary super block to device %llu",
+ device->devid);
+ return -1;
+ }
+
return errors < i ? 0 : -1;
}
@@ -3851,7 +3928,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
* So here we should only check item pointers, not item data.
*/
if (btrfs_header_level(buf) == 0 &&
- btrfs_check_leaf_relaxed(root, buf)) {
+ btrfs_check_leaf_relaxed(fs_info, buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -3890,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+ return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ level, first_key);
}
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
@@ -4314,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);
- if (!cache) {
- btrfs_err(fs_info, "orphan block group dirty_bgs list");
- spin_unlock(&cur_trans->dirty_bgs_lock);
- return;
- }
if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4338,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
struct btrfs_block_group_cache,
io_list);
- if (!cache) {
- btrfs_err(fs_info, "orphan block group on io_bgs list");
- return;
- }
list_del_init(&cache->io_list);
spin_lock(&cache->lock);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 301151a50ac1..453ea9f5d4e9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,8 +52,9 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
-struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 parent_transid);
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key);
void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
int mirror_num, struct extent_buffer **eb);
@@ -123,7 +124,8 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+ struct btrfs_key *first_key);
u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, u8 *result);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -131,9 +133,8 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset, void *private_data,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done);
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+ extent_submit_bio_start_t *submit_bio_start,
+ extent_submit_bio_done_t *submit_bio_done);
int btrfs_write_tree_block(struct extent_buffer *buf);
void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -154,7 +155,7 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
int create);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
-void btrfs_end_io_wq_exit(void);
+void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c1618ab9fecf..e08d0d45af4f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -27,7 +27,7 @@
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
@@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work)
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
- struct btrfs_root *extent_root;
int ret;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- extent_root = fs_info->extent_root;
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
unsigned long nr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
if (trans->transid > async->transid)
goto end;
- ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+ ret = btrfs_run_delayed_refs(trans, async->count);
if (ret)
async->error = ret;
end:
@@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
* Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, unsigned long count)
+ unsigned long count)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3077,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
trans->can_flush_pending_bgs = false;
- ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+ ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3086,7 +3085,7 @@ again:
if (run_all) {
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first(&delayed_refs->href_root);
@@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
* the commit latency by getting rid of the easy block groups while
* we're still allowing others to join the commit.
*/
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
@@ -3686,7 +3685,7 @@ again:
* make sure all the block groups on our dirty list actually
* exist
*/
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
if (!path) {
path = btrfs_alloc_path();
@@ -3741,8 +3740,9 @@ again:
should_put = 0;
/*
- * the cache_write_mutex is protecting
- * the io_list
+ * The cache_write_mutex is protecting the
+ * io_list, also refer to the definition of
+ * btrfs_transaction::io_bgs for more details
*/
list_add_tail(&cache->io_list, io);
} else {
@@ -3800,7 +3800,7 @@ again:
* go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans, fs_info,
+ ret = btrfs_run_delayed_refs(trans,
(unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
io_list);
@@ -3990,7 +3994,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
bg = btrfs_lookup_block_group(fs_info, bytenr);
ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
- wake_up_atomic_t(&bg->nocow_writers);
+ wake_up_var(&bg->nocow_writers);
/*
* Once for our lookup and once for the lookup done by a previous call
* to btrfs_inc_nocow_writers()
@@ -4001,8 +4005,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
{
- wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}
static const char *alloc_name(u64 flags)
@@ -4333,8 +4336,7 @@ again:
/* commit the current transaction and try again */
commit_trans:
- if (need_commit &&
- !atomic_read(&fs_info->open_ioctl_trans)) {
+ if (need_commit) {
need_commit--;
if (need_commit > 0) {
@@ -4542,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
@@ -4603,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
return -ENOSPC;
space_info = __find_space_info(fs_info, flags);
- if (!space_info) {
- ret = create_space_info(fs_info, flags, &space_info);
- if (ret)
- return ret;
- }
+ ASSERT(space_info);
again:
spin_lock(&space_info->lock);
@@ -4706,7 +4704,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4827,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
long time_left;
unsigned long nr_pages;
int loops;
- enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4868,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
atomic_read(&fs_info->async_delalloc_pages) <=
(int)max_reclaim);
skip_async:
- if (!trans)
- flush = BTRFS_RESERVE_FLUSH_ALL;
- else
- flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -4994,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+ ret = btrfs_run_delayed_items_nr(trans, nr);
btrfs_end_transaction(trans);
break;
case FLUSH_DELALLOC:
@@ -5389,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
!block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
orig_bytes, 1);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ dump_space_info(fs_info, block_rsv->space_info,
+ orig_bytes, 0);
+ }
return ret;
}
@@ -5761,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
if (num_bytes == 0)
return 0;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret)
+ return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -5773,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
/**
* btrfs_inode_rsv_release - release any excessive reservation.
* @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ * Unlike normal operation, qgroup meta reservation needs to know if we are
+ * freeing qgroup reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
*/
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5793,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(inode->root, released);
+ else
+ btrfs_qgroup_convert_reserved_meta(inode->root, released);
}
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5893,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- if (!trans->block_rsv) {
- ASSERT(!trans->bytes_reserved);
- return;
- }
-
- if (!trans->bytes_reserved)
- return;
-
- ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
- trans->bytes_reserved = 0;
-}
/*
* To be called after all the new block groups attached to the transaction
@@ -5952,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
+ trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
num_bytes, 1);
return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
@@ -5996,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret)
return ret;
} else {
@@ -6015,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
- btrfs_qgroup_free_meta(root, *qgroup_reserved);
+ btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
return ret;
}
@@ -6052,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
@@ -6069,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
- } else if (current->journal_info) {
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
- }
+ } else {
+ if (current->journal_info)
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
- if (flush != BTRFS_RESERVE_NO_FLUSH &&
- btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ }
if (delalloc_lock)
mutex_lock(&inode->delalloc_mutex);
@@ -6090,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- ret = btrfs_qgroup_reserve_meta(root,
- nr_extents * fs_info->nodesize, true);
- if (ret)
- goto out_fail;
- }
-
ret = btrfs_inode_rsv_refill(inode, flush);
- if (unlikely(ret)) {
- btrfs_qgroup_free_meta(root,
- nr_extents * fs_info->nodesize);
+ if (unlikely(ret))
goto out_fail;
- }
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
@@ -6116,7 +6096,7 @@ out_fail:
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -6126,12 +6106,14 @@ out_fail:
* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
* reservations, or on error for the same reason.
*/
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
@@ -6144,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free qgroup meta reservation or convert them.
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -6158,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
unsigned num_extents;
@@ -6172,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
@@ -6228,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
*/
void btrfs_delalloc_release_space(struct inode *inode,
struct extent_changeset *reserved,
- u64 start, u64 len)
+ u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
@@ -6526,7 +6510,7 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
bg = btrfs_lookup_block_group(fs_info, start);
ASSERT(bg);
if (atomic_dec_and_test(&bg->reservations))
- wake_up_atomic_t(&bg->reservations);
+ wake_up_var(&bg->reservations);
btrfs_put_block_group(bg);
}
@@ -6552,8 +6536,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
down_write(&space_info->groups_sem);
up_write(&space_info->groups_sem);
- wait_on_atomic_t(&bg->reservations, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}
/**
@@ -6785,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
@@ -7353,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return ret;
}
-int __get_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
- return __get_raid_index(cache->flags);
-}
-
static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = "raid10",
[BTRFS_RAID_RAID1] = "raid1",
@@ -7490,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = __get_raid_index(flags);
+ int index = btrfs_bg_flags_to_raid_index(flags);
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7576,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
@@ -7586,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
search:
have_caching_bg = false;
- if (index == 0 || index == __get_raid_index(flags))
+ if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7844,7 +7805,8 @@ checks:
loop:
failed_cluster_refill = false;
failed_alloc = false;
- BUG_ON(index != get_block_group_index(block_group));
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ index);
btrfs_release_block_group(block_group, delalloc);
cond_resched();
}
@@ -7998,6 +7960,51 @@ again:
up_read(&info->groups_sem);
}
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ * hole that is at least as big as @num_bytes.
+ *
+ * @root - The root that will contain this extent
+ *
+ * @ram_bytes - The amount of space in ram that @num_bytes take. This
+ * is used for accounting purposes. This value differs
+ * from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes - Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size - Indicates the minimum amount of space that the
+ * allocator should try to satisfy. In some cases
+ * @num_bytes may be larger than what is required and if
+ * the filesystem is fragmented then allocation fails.
+ * However, the presence of @min_alloc_size gives a
+ * chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size - A hint that you plan on doing more COW. This is the
+ * size in bytes the allocator should try to find free
+ * next to the block it returns. This is just a hint and
+ * may be ignored by the allocator.
+ *
+ * @hint_byte - Hint to the allocator to start searching above the byte
+ * address passed. It might be ignored.
+ *
+ * @ins - This key is modified to record the found hole. It will
+ * have the following values:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ *
+ * @is_data - Boolean flag indicating whether an extent is
+ * allocated for data (true) or metadata (false)
+ *
+ * @delalloc - Boolean flag indicating whether this allocation is for
+ * delalloc or not. If 'true' data_rwsem of block groups
+ * is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
@@ -8701,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 parent;
u32 blocksize;
struct btrfs_key key;
+ struct btrfs_key first_key;
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -8721,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
blocksize = fs_info->nodesize;
next = find_extent_buffer(fs_info, bytenr);
@@ -8785,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation);
+ next = read_tree_block(fs_info, bytenr, generation, level - 1,
+ &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -9650,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
- index = __get_raid_index(extended_to_chunk(target));
+ index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -9664,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
goto out;
}
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
}
if (index == BTRFS_RAID_RAID10) {
@@ -9913,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
return 0;
}
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct raid_kobject *rkobj;
+ LIST_HEAD(list);
+ int index;
+ int ret = 0;
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_splice_init(&fs_info->pending_raid_kobjs, &list);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+ list_for_each_entry(rkobj, &list, list) {
+ space_info = __find_space_info(fs_info, rkobj->flags);
+ index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ break;
+ }
+ }
+ if (ret)
+ btrfs_warn(fs_info,
+ "failed to add kobject for block cache, ignoring");
+}
+
static void link_block_group(struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
- int index = get_block_group_index(cache);
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ int index = btrfs_bg_flags_to_raid_index(cache->flags);
bool first = false;
down_write(&space_info->groups_sem);
@@ -9926,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
up_write(&space_info->groups_sem);
if (first) {
- struct raid_kobject *rkobj;
- int ret;
-
- rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
- if (!rkobj)
- goto out_err;
- rkobj->raid_type = index;
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
- "%s", get_raid_name(index));
- if (ret) {
- kobject_put(&rkobj->kobj);
- goto out_err;
+ struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj) {
+ btrfs_warn(cache->fs_info,
+ "couldn't alloc memory for raid level kobject");
+ return;
}
+ rkobj->flags = cache->flags;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
space_info->block_group_kobjs[index] = &rkobj->kobj;
}
-
- return;
-out_err:
- btrfs_warn(cache->fs_info,
- "failed to add kobject for block cache, ignoring");
}
static struct btrfs_block_group_cache *
@@ -10162,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
inc_block_group_ro(cache, 1);
}
+ btrfs_add_raid_kobjects(info);
init_global_block_rsv(info);
ret = 0;
error:
@@ -10169,9 +10204,9 @@ error:
return ret;
}
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
@@ -10256,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* with its ->space_info set.
*/
cache->space_info = __find_space_info(fs_info, cache->flags);
- if (!cache->space_info) {
- ret = create_space_info(fs_info, cache->flags,
- &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
- }
- }
+ ASSERT(cache->space_info);
ret = btrfs_add_block_group_cache(fs_info, cache);
if (ret) {
@@ -10336,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->key.offset);
memcpy(&key, &block_group->key, sizeof(key));
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -11061,7 +11088,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
ret = btrfs_start_write_no_snapshotting(root);
if (ret)
break;
- wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&root->will_be_snapshotted,
+ !atomic_read(&root->will_be_snapshotted));
}
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dfeb74a0be77..47a8fe9d22e8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -76,8 +76,8 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&buffers)) {
eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
- eb->start, eb->len, atomic_read(&eb->refs));
+ pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -119,23 +119,22 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static void add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, unsigned bits,
struct extent_changeset *changeset,
int set)
{
int ret;
if (!changeset)
- return;
+ return 0;
if (set && (state->state & bits) == bits)
- return;
+ return 0;
if (!set && (state->state & bits) == 0)
- return;
+ return 0;
changeset->bytes_changed += state->end - state->start + 1;
ret = ulist_add(&changeset->range_changed, state->start, state->end,
GFP_ATOMIC);
- /* ENOMEM */
- BUG_ON(ret < 0);
+ return ret;
}
static void flush_write_bio(struct extent_page_data *epd);
@@ -187,7 +186,7 @@ free_state_cache:
return -ENOMEM;
}
-void extent_io_exit(void)
+void __cold extent_io_exit(void)
{
btrfs_leak_debug_check();
@@ -527,6 +526,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
{
struct extent_state *next;
unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -534,7 +534,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
- add_extent_changeset(state, bits_to_clear, changeset, 0);
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
@@ -805,13 +806,15 @@ static void set_state_bits(struct extent_io_tree *tree,
unsigned *bits, struct extent_changeset *changeset)
{
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+ int ret;
set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- add_extent_changeset(state, bits_to_set, changeset, 1);
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
state->state |= bits_to_set;
}
@@ -2744,20 +2747,21 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
return blk_status_to_errno(ret);
}
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
- unsigned long offset, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- int ret = 0;
- if (tree->ops)
- ret = tree->ops->merge_bio_hook(page, offset, size, bio,
- bio_flags);
- return ret;
-
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
+ * @tree: tree so we can call our merge_bio hook
+ * @wbc: optional writeback control for io accounting
+ * @page: page to add to the bio
+ * @pg_offset: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @size: portion of page that we want to write
+ * @offset: starting offset in the page
+ * @bdev: attach newly created bios to this bdev
+ * @bio_ret: must be valid pointer, newly allocated bio will be stored there
+ * @end_io_func: end_io callback for new bio
+ * @mirror_num: desired mirror to read/write
+ * @prev_bio_flags: flags of previous bio to see if we can merge the current one
+ * @bio_flags: flags of the current bio to see if we can merge them
*/
static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct writeback_control *wbc,
@@ -2773,21 +2777,27 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
{
int ret = 0;
struct bio *bio;
- int contig = 0;
- int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
- if (bio_ret && *bio_ret) {
+ ASSERT(bio_ret);
+
+ if (*bio_ret) {
+ bool contig;
+ bool can_merge = true;
+
bio = *bio_ret;
- if (old_compressed)
+ if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
- if (prev_bio_flags != bio_flags || !contig ||
+ if (tree->ops && tree->ops->merge_bio_hook(page, offset,
+ page_size, bio, bio_flags))
+ can_merge = false;
+
+ if (prev_bio_flags != bio_flags || !contig || !can_merge ||
force_bio_submit ||
- merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, pg_offset) < page_size) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
@@ -2813,10 +2823,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
wbc_account_io(wbc, page, page_size);
}
- if (bio_ret)
- *bio_ret = bio;
- else
- ret = submit_one_bio(bio, mirror_num, bio_flags);
+ *bio_ret = bio;
return ret;
}
@@ -2886,8 +2893,7 @@ static int __do_readpage(struct extent_io_tree *tree,
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
- u64 end;
+ const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
@@ -2905,7 +2911,6 @@ static int __do_readpage(struct extent_io_tree *tree,
set_page_extent_mapped(page);
- end = page_end;
if (!PageUptodate(page)) {
if (cleancache_get_page(page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
@@ -5230,11 +5235,6 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
-int extent_buffer_uptodate(struct extent_buffer *eb)
-{
- return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-}
-
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, int wait, int mirror_num)
{
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a7a850abd600..b77d84909863 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -83,8 +83,8 @@ static inline int le_test_bit(int nr, const u8 *addr)
return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
}
-extern void le_bitmap_set(u8 *map, unsigned int start, int len);
-extern void le_bitmap_clear(u8 *map, unsigned int start, int len);
+void le_bitmap_set(u8 *map, unsigned int start, int len);
+void le_bitmap_clear(u8 *map, unsigned int start, int len);
struct extent_state;
struct btrfs_root;
@@ -95,6 +95,13 @@ struct io_failure_record;
typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset);
+
+typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
+ struct bio *bio, u64 bio_offset);
+
+typedef blk_status_t (extent_submit_bio_done_t)(void *private_data,
+ struct bio *bio, int mirror_num);
+
struct extent_io_ops {
/*
* The following callbacks must be allways defined, the function
@@ -286,7 +293,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
-void extent_io_exit(void);
+void __cold extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
@@ -455,6 +462,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
atomic_inc(&eb->refs);
}
+static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+}
+
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
@@ -489,7 +501,6 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(const struct extent_buffer *eb,
unsigned long offset, unsigned long min_len,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index d3bd02105d1c..53a0633c6ef7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2,7 +2,6 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/hardirq.h>
#include "ctree.h"
#include "extent_map.h"
#include "compression.h"
@@ -20,7 +19,7 @@ int __init extent_map_init(void)
return 0;
}
-void extent_map_exit(void)
+void __cold extent_map_exit(void)
{
kmem_cache_destroy(extent_map_cache);
}
@@ -552,6 +551,9 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
ret = 0;
existing = search_extent_mapping(em_tree, start, len);
+
+ trace_btrfs_handle_em_exist(existing, em, start, len);
+
/*
* existing will always be non-NULL, since there must be
* extent causing the -EEXIST.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b29f77bc0732..f6f8ba114977 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -86,7 +86,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
-void extent_map_exit(void);
+void __cold extent_map_exit(void);
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 41ab9073d1d4..f247300170e5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1691,7 +1691,7 @@ again:
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes, true);
break;
}
@@ -1703,7 +1703,7 @@ again:
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes, true);
ret = extents_locked;
break;
}
@@ -1738,7 +1738,7 @@ again:
fs_info->sb->s_blocksize_bits;
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes);
+ release_bytes, true);
} else {
u64 __pos;
@@ -1747,7 +1747,7 @@ again:
(dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(inode,
data_reserved, __pos,
- release_bytes);
+ release_bytes, true);
}
}
@@ -1760,7 +1760,8 @@ again:
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
+ (ret != 0));
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -1800,11 +1801,11 @@ again:
if (only_release_metadata) {
btrfs_end_write_no_snapshotting(root);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes);
+ release_bytes, true);
} else {
btrfs_delalloc_release_space(inode, data_reserved,
round_down(pos, fs_info->sectorsize),
- release_bytes);
+ release_bytes, true);
}
}
@@ -1997,8 +1998,6 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
- if (private && private->trans)
- btrfs_ioctl_trans_end(filp);
if (private && private->filldir_buf)
kfree(private->filldir_buf);
kfree(private);
@@ -2190,12 +2189,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
/*
- * ok we haven't committed the transaction yet, lets do a commit
- */
- if (file->private_data)
- btrfs_ioctl_trans_end(file);
-
- /*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log, which could require joining a transaction (for
* example checking cross references in the nocow path). If we use join
@@ -2214,7 +2207,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2482,7 +2475,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
if ((!ordered ||
(ordered->file_offset + ordered->len <= lockstart ||
ordered->file_offset > lockend)) &&
- !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+ !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -3378,7 +3372,7 @@ const struct file_operations btrfs_file_operations = {
.dedupe_file_range = btrfs_dedupe_file_range,
};
-void btrfs_auto_defrag_exit(void)
+void __cold btrfs_auto_defrag_exit(void)
{
kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a9f22ac50d6a..d0dde9e6afd7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3547,7 +3547,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
if (ret) {
if (release_metadata)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- inode->i_size);
+ inode->i_size, true);
#ifdef DEBUG
btrfs_err(fs_info,
"failed to write free ino cache for root %llu",
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index fe5e0324dca9..af36a6a971fe 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1071,7 +1071,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path2 = btrfs_alloc_path();
if (!path2) {
@@ -1573,7 +1573,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
info = search_free_space_info(NULL, fs_info, block_group, path, 0);
if (IS_ERR(info)) {
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
deleted file mode 100644
index baacc1866861..000000000000
--- a/fs/btrfs/hash.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-#include <crypto/hash.h>
-#include <linux/err.h>
-#include "hash.h"
-
-static struct crypto_shash *tfm;
-
-int __init btrfs_hash_init(void)
-{
- tfm = crypto_alloc_shash("crc32c", 0, 0);
-
- return PTR_ERR_OR_ZERO(tfm);
-}
-
-const char* btrfs_crc32c_impl(void)
-{
- return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
-}
-
-void btrfs_hash_exit(void)
-{
- crypto_free_shash(tfm);
-}
-
-u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
-{
- SHASH_DESC_ON_STACK(shash, tfm);
- u32 *ctx = (u32 *)shash_desc_ctx(shash);
- u32 retval;
- int err;
-
- shash->tfm = tfm;
- shash->flags = 0;
- *ctx = crc;
-
- err = crypto_shash_update(shash, address, length);
- BUG_ON(err);
-
- retval = *ctx;
- barrier_data(ctx);
- return retval;
-}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
deleted file mode 100644
index c3a2ec554361..000000000000
--- a/fs/btrfs/hash.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __HASH__
-#define __HASH__
-
-int __init btrfs_hash_init(void);
-
-void btrfs_hash_exit(void);
-const char* btrfs_crc32c_impl(void);
-
-u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
- return btrfs_crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
- int len)
-{
- return (u64) btrfs_crc32c(parent_objectid, name, len);
-}
-
-#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 39c968f80157..1d5631ef2738 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,14 +18,13 @@
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "print-tree.h"
-static int find_name_in_backref(struct btrfs_path *path, const char *name,
- int name_len, struct btrfs_inode_ref **ref_ret)
+int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
+ const char *name,
+ int name_len, struct btrfs_inode_ref **ref_ret)
{
- struct extent_buffer *leaf;
struct btrfs_inode_ref *ref;
unsigned long ptr;
unsigned long name_ptr;
@@ -33,9 +32,8 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
u32 cur_offset = 0;
int len;
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
len = btrfs_inode_ref_name_len(leaf, ref);
@@ -44,18 +42,19 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
if (len != name_len)
continue;
if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
- *ref_ret = ref;
+ if (ref_ret)
+ *ref_ret = ref;
return 1;
}
}
return 0;
}
-int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
+ u64 ref_objectid,
const char *name, int name_len,
struct btrfs_inode_extref **extref_ret)
{
- struct extent_buffer *leaf;
struct btrfs_inode_extref *extref;
unsigned long ptr;
unsigned long name_ptr;
@@ -63,9 +62,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
u32 cur_offset = 0;
int ref_name_len;
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
/*
* Search all extended backrefs in this item. We're only
@@ -113,7 +111,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
if (ret > 0)
return NULL;
- if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+ if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+ ref_objectid, name, name_len,
+ &extref))
return NULL;
return extref;
}
@@ -155,7 +155,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
* This should always succeed so error here will make the FS
* readonly.
*/
- if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+ if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+ ref_objectid,
name, name_len, &extref)) {
btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
@@ -225,7 +226,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
} else if (ret < 0) {
goto out;
}
- if (!find_name_in_backref(path, name, name_len, &ref)) {
+ if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, name_len, &ref)) {
ret = -ENOENT;
search_ext_refs = 1;
goto out;
@@ -293,7 +295,9 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
- if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ if (btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid,
name, name_len, NULL))
goto out;
@@ -351,7 +355,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EEXIST) {
u32 old_size;
- if (find_name_in_backref(path, name, name_len, &ref))
+ if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, name_len, &ref))
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
@@ -365,7 +370,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ret = 0;
} else if (ret < 0) {
if (ret == -EOVERFLOW) {
- if (find_name_in_backref(path, name, name_len, &ref))
+ if (btrfs_find_name_in_backref(path->nodes[0],
+ path->slots[0],
+ name, name_len, &ref))
ret = -EEXIST;
else
ret = -EMLINK;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 022b19336fee..9409dcc7020d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -500,12 +500,12 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a79299a89b7d..1f091c2358a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -58,7 +58,6 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
-#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"
@@ -102,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
};
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
-static int btrfs_truncate(struct inode *inode);
+static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
@@ -277,12 +276,12 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_root *root,
- struct inode *inode, u64 start,
+static noinline int cow_file_range_inline(struct inode *inode, u64 start,
u64 end, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
u64 isize = i_size_read(inode);
@@ -458,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode,
int *num_added)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
u64 isize = i_size_read(inode);
@@ -580,11 +578,11 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(root, inode, start, end,
- 0, BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(root, inode, start, end,
+ ret = cow_file_range_inline(inode, start, end,
total_compressed,
compress_type, pages);
}
@@ -961,7 +959,6 @@ static noinline int cow_file_range(struct inode *inode,
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
- u64 disk_num_bytes;
u64 cur_alloc_size = 0;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
@@ -979,14 +976,14 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
- disk_num_bytes = num_bytes;
+ ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
if (start == 0) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(root, inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1010,15 +1007,12 @@ static noinline int cow_file_range(struct inode *inode,
}
}
- BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(fs_info->super_copy));
-
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
- while (disk_num_bytes > 0) {
- cur_alloc_size = disk_num_bytes;
+ while (num_bytes > 0) {
+ cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
fs_info->sectorsize, 0, alloc_hint,
&ins, 1, 1);
@@ -1082,11 +1076,10 @@ static noinline int cow_file_range(struct inode *inode,
delalloc_end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
- if (disk_num_bytes < cur_alloc_size)
- disk_num_bytes = 0;
+ if (num_bytes < cur_alloc_size)
+ num_bytes = 0;
else
- disk_num_bytes -= cur_alloc_size;
- num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
extent_reserved = false;
@@ -1262,6 +1255,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
list_del(&sums->list);
kfree(sums);
}
+ if (ret < 0)
+ return ret;
return 1;
}
@@ -1394,10 +1389,23 @@ next_slot:
goto out_check;
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
- if (btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr))
+ ret = btrfs_cross_ref_exist(root, ino,
+ found_key.offset -
+ extent_offset, disk_bytenr);
+ if (ret) {
+ /*
+ * ret could be -EIO if the above fails to read
+ * metadata.
+ */
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+
+ WARN_ON_ONCE(nolock);
goto out_check;
+ }
disk_bytenr += extent_offset;
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
@@ -1415,10 +1423,22 @@ next_slot:
* this ensure that csum for a given extent are
* either valid or do not exist.
*/
- if (csum_exist_in_range(fs_info, disk_bytenr,
- num_bytes)) {
+ ret = csum_exist_in_range(fs_info, disk_bytenr,
+ num_bytes);
+ if (ret) {
if (!nolock)
btrfs_end_write_no_snapshotting(root);
+
+ /*
+ * ret could be -EIO if the above fails to read
+ * metadata.
+ */
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+ WARN_ON_ONCE(nolock);
goto out_check;
}
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
@@ -1847,7 +1867,7 @@ static void btrfs_clear_bit_hook(void *private_data,
*/
if (*bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(inode, len);
+ btrfs_delalloc_release_metadata(inode, len, false);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
@@ -1921,8 +1941,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
+static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
struct inode *inode = private_data;
@@ -1941,9 +1960,8 @@ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num)
{
struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2015,8 +2033,8 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
bio_offset, inode,
- __btrfs_submit_bio_start,
- __btrfs_submit_bio_done);
+ btrfs_submit_bio_start,
+ btrfs_submit_bio_done);
goto out;
} else if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -2043,12 +2061,15 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
struct inode *inode, struct list_head *list)
{
struct btrfs_ordered_sum *sum;
+ int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- btrfs_csum_file_blocks(trans,
+ ret = btrfs_csum_file_blocks(trans,
BTRFS_I(inode)->root->fs_info->csum_root, sum);
trans->adding_csums = false;
+ if (ret)
+ return ret;
}
return 0;
}
@@ -2131,7 +2152,7 @@ again:
ClearPageChecked(page);
set_page_dirty(page);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -2751,12 +2772,10 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
struct inode *inode;
- struct btrfs_root *root;
struct rb_node *node;
int ret;
inode = new->inode;
- root = BTRFS_I(inode)->root;
path = btrfs_alloc_path();
if (!path)
@@ -3062,7 +3081,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
ret = btrfs_update_inode_fallback(trans, root, inode);
@@ -3240,6 +3263,16 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
start, (size_t)(end - start + 1));
}
+/*
+ * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ *
+ * @inode: The inode we want to perform iput on
+ *
+ * This function uses the generic vfs_inode::i_count to track whether we should
+ * just decrement it (in case it's > 1) or if this is the last iput then link
+ * the inode to the delayed iput machinery. Delayed iputs are processed at
+ * transaction commit time/superblock commit/cleaner kthread.
+ */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3249,12 +3282,8 @@ void btrfs_add_delayed_iput(struct inode *inode)
return;
spin_lock(&fs_info->delayed_iput_lock);
- if (binode->delayed_iput_count == 0) {
- ASSERT(list_empty(&binode->delayed_iput));
- list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
- } else {
- binode->delayed_iput_count++;
- }
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
}
@@ -3267,13 +3296,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
- if (inode->delayed_iput_count) {
- inode->delayed_iput_count--;
- list_move_tail(&inode->delayed_iput,
- &fs_info->delayed_iputs);
- } else {
- list_del_init(&inode->delayed_iput);
- }
+ list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
spin_lock(&fs_info->delayed_iput_lock);
@@ -3343,7 +3366,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = NULL;
int reserve = 0;
- int insert = 0;
+ bool insert = false;
int ret;
if (!root->orphan_block_rsv) {
@@ -3353,7 +3376,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
+ if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &inode->runtime_flags))
+ insert = true;
+
+ if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &inode->runtime_flags))
+ reserve = 1;
+
spin_lock(&root->orphan_lock);
+ /* If someone has created ->orphan_block_rsv, be happy to use it. */
if (!root->orphan_block_rsv) {
root->orphan_block_rsv = block_rsv;
} else if (block_rsv) {
@@ -3361,26 +3393,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
block_rsv = NULL;
}
- if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags)) {
-#if 0
- /*
- * For proper ENOSPC handling, we should do orphan
- * cleanup when mounting. But this introduces backward
- * compatibility issue.
- */
- if (!xchg(&root->orphan_item_inserted, 1))
- insert = 2;
- else
- insert = 1;
-#endif
- insert = 1;
+ if (insert)
atomic_inc(&root->orphan_inodes);
- }
-
- if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags))
- reserve = 1;
spin_unlock(&root->orphan_lock);
/* grab metadata reservation from transaction handle */
@@ -3404,7 +3418,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
}
/* insert an orphan item to track this unlinked/truncated file */
- if (insert >= 1) {
+ if (insert) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret) {
if (reserve) {
@@ -3428,15 +3442,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
ret = 0;
}
- /* insert an orphan item to track subvolume contains orphan files */
- if (insert >= 2) {
- ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
- root->root_key.objectid);
- if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- return ret;
- }
- }
return 0;
}
@@ -3637,7 +3642,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
}
- ret = btrfs_truncate(inode);
+ ret = btrfs_truncate(inode, false);
if (ret)
btrfs_orphan_del(NULL, BTRFS_I(inode));
} else {
@@ -4704,7 +4709,6 @@ delete:
if (updates) {
trans->delayed_ref_updates = 0;
ret = btrfs_run_delayed_refs(trans,
- fs_info,
updates * 2);
if (ret && !err)
err = ret;
@@ -4744,8 +4748,7 @@ error:
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans, fs_info,
- updates * 2);
+ ret = btrfs_run_delayed_refs(trans, updates * 2);
if (ret && !err)
err = ret;
}
@@ -4799,8 +4802,8 @@ again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
- block_start, blocksize);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ block_start, blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
ret = -ENOMEM;
goto out;
}
@@ -4867,8 +4870,8 @@ again:
out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
unlock_page(page);
put_page(page);
out:
@@ -5123,7 +5126,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
- ret = btrfs_truncate(inode);
+ ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
@@ -5459,7 +5462,8 @@ no_delete:
/*
* this returns the key found in the dir entry in the location pointer.
- * If no dir entries were found, location->objectid is 0.
+ * If no dir entries were found, returns -ENOENT.
+ * If a corrupted location is found in the dir entry, returns -EUCLEAN.
*/
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
struct btrfs_key *location)
@@ -5477,27 +5481,27 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
- if (IS_ERR(di))
+ if (!di) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (IS_ERR(di)) {
ret = PTR_ERR(di);
-
- if (IS_ERR_OR_NULL(di))
- goto out_err;
+ goto out;
+ }
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
if (location->type != BTRFS_INODE_ITEM_KEY &&
location->type != BTRFS_ROOT_ITEM_KEY) {
+ ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
__func__, name, btrfs_ino(BTRFS_I(dir)),
location->objectid, location->type, location->offset);
- goto out_err;
}
out:
btrfs_free_path(path);
return ret;
-out_err:
- location->objectid = 0;
- goto out;
}
/*
@@ -5800,9 +5804,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret < 0)
return ERR_PTR(ret);
- if (location.objectid == 0)
- return ERR_PTR(-ENOENT);
-
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
return inode;
@@ -7436,76 +7437,6 @@ out:
return ret;
}
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
-{
- struct radix_tree_root *root = &inode->i_mapping->page_tree;
- bool found = false;
- void **pagep = NULL;
- struct page *page = NULL;
- unsigned long start_idx;
- unsigned long end_idx;
-
- start_idx = start >> PAGE_SHIFT;
-
- /*
- * end is the last byte in the last page. end == start is legal
- */
- end_idx = end >> PAGE_SHIFT;
-
- rcu_read_lock();
-
- /* Most of the code in this while loop is lifted from
- * find_get_page. It's been modified to begin searching from a
- * page and return just the first page found in that range. If the
- * found idx is less than or equal to the end idx then we know that
- * a page exists. If no pages are found or if those pages are
- * outside of the range then we're fine (yay!) */
- while (page == NULL &&
- radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
- page = radix_tree_deref_slot(pagep);
- if (unlikely(!page))
- break;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- page = NULL;
- continue;
- }
- /*
- * Otherwise, shmem/tmpfs must be storing a swap entry
- * here as an exceptional entry: so return it without
- * attempting to raise page count.
- */
- page = NULL;
- break; /* TODO: Is this relevant for this use case? */
- }
-
- if (!page_cache_get_speculative(page)) {
- page = NULL;
- continue;
- }
-
- /*
- * Has the page moved?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != *pagep)) {
- put_page(page);
- page = NULL;
- }
- }
-
- if (page) {
- if (page->index <= end_idx)
- found = true;
- put_page(page);
- }
-
- rcu_read_unlock();
- return found;
-}
-
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, int writing)
{
@@ -7531,8 +7462,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* get stale data.
*/
if (!ordered &&
- (!writing ||
- !btrfs_page_exists_in_range(inode, lockstart, lockend)))
+ (!writing || !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -8263,9 +8194,8 @@ static void btrfs_endio_direct_write(struct bio *bio)
bio_put(bio);
}
-static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags, u64 offset)
+static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
+ struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
blk_status_t ret;
@@ -8291,13 +8221,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
if (err) {
- dip->errors = 1;
-
/*
- * before atomic variable goto zero, we must make sure
- * dip->errors is perceived to be set.
+ * We want to perceive the errors flag being set before
+ * decrementing the reference count. We don't need a barrier
+ * since atomic operations with a return value are fully
+ * ordered as per atomic_t.txt
*/
- smp_mb__before_atomic();
+ dip->errors = 1;
}
/* if there are more bios still pending for this dio, just exit */
@@ -8345,9 +8275,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
return 0;
}
-static inline blk_status_t
-__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
- int async_submit)
+static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
+ struct inode *inode, u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
@@ -8370,8 +8299,8 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
if (write && async_submit) {
ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
file_offset, inode,
- __btrfs_submit_bio_start_direct_io,
- __btrfs_submit_bio_done);
+ btrfs_submit_bio_start_direct_io,
+ btrfs_submit_bio_done);
goto err;
} else if (write) {
/*
@@ -8457,7 +8386,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
*/
atomic_inc(&dip->pending_bios);
- status = __btrfs_submit_dio_bio(bio, inode, file_offset,
+ status = btrfs_submit_dio_bio(bio, inode, file_offset,
async_submit);
if (status) {
bio_put(bio);
@@ -8477,7 +8406,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
} while (submit_len > 0);
submit:
- status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+ status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
if (!status)
return 0;
@@ -8485,10 +8414,11 @@ submit:
out_err:
dip->errors = 1;
/*
- * before atomic variable goto zero, we must
- * make sure dip->errors is perceived to be set.
+ * Before the atomic variable goes to zero, we must make sure dip->errors
+ * is perceived to be set. This ordering is ensured by the fact that
+ * atomic operations with a return value are fully ordered as per
+ * atomic_t.txt
*/
- smp_mb__before_atomic();
if (atomic_dec_and_test(&dip->pending_bios))
bio_io_error(dip->orig_bio);
@@ -8706,7 +8636,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve);
+ offset, dio_data.reserve, true);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
@@ -8722,8 +8652,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
false);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
- offset, count - (size_t)ret);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+ offset, count - (size_t)ret, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
}
out:
if (wakeup)
@@ -9038,7 +8968,8 @@ again:
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE - reserved_space);
+ page_start, PAGE_SIZE - reserved_space,
+ true);
}
}
@@ -9088,23 +9019,23 @@ again:
out_unlock:
if (!ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
}
unlock_page(page);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
btrfs_delalloc_release_space(inode, data_reserved, page_start,
- reserved_space);
+ reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return ret;
}
-static int btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -9115,10 +9046,12 @@ static int btrfs_truncate(struct inode *inode)
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
- ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
- (u64)-1);
- if (ret)
- return ret;
+ if (!skip_writeback) {
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
/*
* Yes ladies and gentlemen, this is indeed ugly. The fact is we have
@@ -9328,7 +9261,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
- ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -9448,7 +9380,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-void btrfs_destroy_cachep(void)
+void __cold btrfs_destroy_cachep(void)
{
/*
* Make sure all delayed rcu free inodes are flushed before we
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 111ee282b777..b2db3988813f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -106,7 +106,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
{
if (S_ISDIR(mode))
return flags;
@@ -723,7 +723,7 @@ fail:
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
if (atomic_dec_and_test(&root->will_be_snapshotted))
- wake_up_atomic_t(&root->will_be_snapshotted);
+ wake_up_var(&root->will_be_snapshotted);
free_pending:
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
@@ -1197,7 +1197,7 @@ again:
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
- (page_cnt - i_done) << PAGE_SHIFT);
+ (page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1215,7 +1215,8 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
+ false);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1225,8 +1226,9 @@ out:
}
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
+ page_cnt << PAGE_SHIFT, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
+ true);
extent_changeset_free(data_reserved);
return ret;
@@ -2600,7 +2602,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
range->len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
- range, 0, 0);
+ range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
kfree(range);
@@ -3936,73 +3938,6 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off,
return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks. They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-static long btrfs_ioctl_trans_start(struct file *file)
-{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
- struct btrfs_file_private *private;
- int ret;
- static bool warned = false;
-
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
- if (!warned) {
- btrfs_warn(fs_info,
- "Userspace transaction mechanism is considered "
- "deprecated and slated to be removed in 4.17. "
- "If you have a valid use case please "
- "speak up on the mailing list");
- WARN_ON(1);
- warned = true;
- }
-
- ret = -EINPROGRESS;
- private = file->private_data;
- if (private && private->trans)
- goto out;
- if (!private) {
- private = kzalloc(sizeof(struct btrfs_file_private),
- GFP_KERNEL);
- if (!private)
- return -ENOMEM;
- file->private_data = private;
- }
-
- ret = -EROFS;
- if (btrfs_root_readonly(root))
- goto out;
-
- ret = mnt_want_write_file(file);
- if (ret)
- goto out;
-
- atomic_inc(&fs_info->open_ioctl_trans);
-
- ret = -ENOMEM;
- trans = btrfs_start_ioctl_transaction(root);
- if (IS_ERR(trans))
- goto out_drop;
-
- private->trans = trans;
- return 0;
-
-out_drop:
- atomic_dec(&fs_info->open_ioctl_trans);
- mnt_drop_write_file(file);
-out:
- return ret;
-}
-
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
@@ -4244,30 +4179,6 @@ out:
return ret;
}
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks. They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-long btrfs_ioctl_trans_end(struct file *file)
-{
- struct inode *inode = file_inode(file);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_file_private *private = file->private_data;
-
- if (!private || !private->trans)
- return -EINVAL;
-
- btrfs_end_transaction(private->trans);
- private->trans = NULL;
-
- atomic_dec(&root->fs_info->open_ioctl_trans);
-
- mnt_drop_write_file(file);
- return 0;
-}
-
static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
void __user *argp)
{
@@ -4429,7 +4340,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = 0;
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
- ret = btrfs_dev_replace_cancel(fs_info, p);
+ p->result = btrfs_dev_replace_cancel(fs_info);
+ ret = 0;
break;
default:
ret = -EINVAL;
@@ -5138,10 +5050,17 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
BTRFS_UUID_SIZE);
if (received_uuid_changed &&
- !btrfs_is_empty_uuid(root_item->received_uuid))
- btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ !btrfs_is_empty_uuid(root_item->received_uuid)) {
+ ret = btrfs_uuid_tree_rem(trans, fs_info,
+ root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
btrfs_set_root_stransid(root_item, sa->stransid);
btrfs_set_root_rtransid(root_item, sa->rtransid);
@@ -5574,10 +5493,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_TRANS_START:
- return btrfs_ioctl_trans_start(file);
- case BTRFS_IOC_TRANS_END:
- return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d13128c70ddd..621083f8932c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -290,7 +290,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
/*
* Make sure counter is updated before we wake up waiters.
*/
- smp_mb();
+ smp_mb__after_atomic();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
} else {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 6c7f18cd3b61..1c7f7f70caf4 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -382,14 +382,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
- size_t tot_len;
int ret = 0;
char *kaddr;
unsigned long bytes;
BUG_ON(srclen < LZO_LEN);
- tot_len = read_compress_length(data_in);
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5b311aeddcc8..661cc3db0c7c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -610,7 +610,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->len);
+ btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -1154,7 +1154,7 @@ int __init ordered_data_init(void)
return 0;
}
-void ordered_data_exit(void)
+void __cold ordered_data_exit(void)
{
kmem_cache_destroy(btrfs_ordered_extent_cache);
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 56c4c0ee6381..4a1672a13ba6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -151,7 +151,9 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
unsigned long bytes)
{
int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
- return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
+ int csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size;
}
static inline void
@@ -215,5 +217,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
-void ordered_data_exit(void);
+void __cold ordered_data_exit(void);
#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 569205e651c7..4a8770485f77 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -365,9 +365,13 @@ void btrfs_print_tree(struct extent_buffer *c)
btrfs_node_blockptr(c, i));
}
for (i = 0; i < nr; i++) {
- struct extent_buffer *next = read_tree_block(fs_info,
- btrfs_node_blockptr(c, i),
- btrfs_node_ptr_generation(c, i));
+ struct btrfs_key first_key;
+ struct extent_buffer *next;
+
+ btrfs_node_key_to_cpu(c, &first_key, i);
+ next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i),
+ level - 1, &first_key);
if (IS_ERR(next)) {
continue;
} else if (!extent_buffer_uptodate(next)) {
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b30a056963ab..5859f7d3cf3e 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -19,8 +19,8 @@
#include <linux/hashtable.h>
#include "props.h"
#include "btrfs_inode.h"
-#include "hash.h"
#include "transaction.h"
+#include "ctree.h"
#include "xattr.h"
#include "compression.h"
@@ -116,7 +116,7 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
return -EINVAL;
if (value_len == 0) {
- ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name,
NULL, 0, flags);
if (ret)
return ret;
@@ -130,13 +130,13 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
ret = handler->validate(value, value_len);
if (ret)
return ret;
- ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name,
value, value_len, flags);
if (ret)
return ret;
ret = handler->apply(inode, value, value_len);
if (ret) {
- __btrfs_setxattr(trans, inode, handler->xattr_name,
+ btrfs_setxattr(trans, inode, handler->xattr_name,
NULL, 0, flags);
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index aa259d6986e1..f583f13ff26e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -47,6 +47,82 @@
* - check all ioctl parameters
*/
+/*
+ * Helpers to access qgroup reservation
+ *
+ * Callers should ensure the lock context and type are valid
+ */
+
+static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
+{
+ u64 ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ ret += qgroup->rsv.values[i];
+
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
+{
+ if (type == BTRFS_QGROUP_RSV_DATA)
+ return "data";
+ if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
+ return "meta_pertrans";
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+ return "meta_prealloc";
+ return NULL;
+}
+#endif
+
+static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+ qgroup->rsv.values[type] += num_bytes;
+}
+
+static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+ if (qgroup->rsv.values[type] >= num_bytes) {
+ qgroup->rsv.values[type] -= num_bytes;
+ return;
+ }
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_RATELIMIT(1,
+ "qgroup %llu %s reserved space underflow, have %llu to free %llu",
+ qgroup->qgroupid, qgroup_rsv_type_str(type),
+ qgroup->rsv.values[type], num_bytes);
+#endif
+ qgroup->rsv.values[type] = 0;
+}
+
+static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *dest,
+ struct btrfs_qgroup *src)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
+}
+
+static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *dest,
+ struct btrfs_qgroup *src)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
+}
+
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
@@ -826,10 +902,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
int slot;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (fs_info->quota_root) {
- set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
+ if (fs_info->quota_root)
goto out;
- }
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
if (!fs_info->qgroup_ulist) {
@@ -923,8 +997,15 @@ out_add_root:
}
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
- set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
+
out_free_path:
btrfs_free_path(path);
out_free_root:
@@ -991,33 +1072,29 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
-static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup *qgroup,
- u64 num_bytes)
-{
-#ifdef CONFIG_BTRFS_DEBUG
- WARN_ON(qgroup->reserved < num_bytes);
- btrfs_debug(fs_info,
- "qgroup %llu reserved space underflow, have: %llu, to free: %llu",
- qgroup->qgroupid, qgroup->reserved, num_bytes);
-#endif
- qgroup->reserved = 0;
-}
/*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their reference and
- * exclusive counts adjusted.
+ * The easy accounting, we're updating qgroup relationship whose child qgroup
+ * only has exclusive extents.
+ *
+ * In this case, all exclusive extents will also be exclusive for parent, so
+ * excl/rfer just get added/removed.
+ *
+ * So is qgroup reservation space, which should also be added/removed to
+ * parent.
+ * Otherwise, when child tries to release reservation space, parent will
+ * underflow its reservation (for relationship adding case).
*
* Caller should hold fs_info->qgroup_lock.
*/
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
struct ulist *tmp, u64 ref_root,
- u64 num_bytes, int sign)
+ struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *glist;
struct ulist_node *unode;
struct ulist_iterator uiter;
+ u64 num_bytes = src->excl;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1030,13 +1107,11 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
- if (sign > 0) {
- trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
- if (qgroup->reserved < num_bytes)
- report_reserved_underflow(fs_info, qgroup, num_bytes);
- else
- qgroup->reserved -= num_bytes;
- }
+
+ if (sign > 0)
+ qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+ else
+ qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
qgroup_dirty(fs_info, qgroup);
@@ -1056,15 +1131,10 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->rfer_cmpr += sign * num_bytes;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
- if (sign > 0) {
- trace_qgroup_update_reserve(fs_info, qgroup,
- -(s64)num_bytes);
- if (qgroup->reserved < num_bytes)
- report_reserved_underflow(fs_info, qgroup,
- num_bytes);
- else
- qgroup->reserved -= num_bytes;
- }
+ if (sign > 0)
+ qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+ else
+ qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
@@ -1107,7 +1177,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
if (qgroup->excl == qgroup->rfer) {
ret = 0;
err = __qgroup_excl_accounting(fs_info, tmp, dst,
- qgroup->excl, sign);
+ qgroup, sign);
if (err < 0) {
ret = err;
goto out;
@@ -1414,7 +1484,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
- assert_spin_locked(&delayed_refs->lock);
+ lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
while (*p) {
@@ -1614,7 +1684,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
return 0;
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen);
+ ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@@ -1645,6 +1715,7 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
+ struct btrfs_key first_key;
int parent_slot;
u64 child_gen;
u64 child_bytenr;
@@ -1657,8 +1728,10 @@ walk_down:
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen);
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ level, &first_key);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -2009,9 +2082,9 @@ out_free:
return ret;
}
-int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
@@ -2080,17 +2153,9 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
{
struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
- int start_rescan_worker = 0;
if (!quota_root)
- goto out;
-
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- test_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
- start_rescan_worker = 1;
-
- if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ return ret;
spin_lock(&fs_info->qgroup_lock);
while (!list_empty(&fs_info->dirty_qgroups)) {
@@ -2119,18 +2184,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- if (!ret && start_rescan_worker) {
- ret = qgroup_rescan_init(fs_info, 0, 1);
- if (!ret) {
- qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_work(fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
- }
- ret = 0;
- }
-
-out:
-
return ret;
}
@@ -2338,24 +2391,24 @@ out:
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
- qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
+ qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
- qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl)
+ qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
return true;
}
-static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
+ enum btrfs_qgroup_rsv_type type)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- int retried = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2369,7 +2422,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
capable(CAP_SYS_RESOURCE))
enforce = false;
-retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -2385,7 +2437,7 @@ retry:
*/
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- (uintptr_t)qgroup, GFP_ATOMIC);
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
@@ -2396,27 +2448,6 @@ retry:
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
- /*
- * Commit the tree and retry, since we may have
- * deletions which would free up space.
- */
- if (!retried && qg->reserved > 0) {
- struct btrfs_trans_handle *trans;
-
- spin_unlock(&fs_info->qgroup_lock);
- ret = btrfs_start_delalloc_inodes(root, 0);
- if (ret)
- return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- retried++;
- goto retry;
- }
ret = -EDQUOT;
goto out;
}
@@ -2424,7 +2455,7 @@ retry:
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
@@ -2439,8 +2470,8 @@ retry:
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, num_bytes);
- qg->reserved += num_bytes;
+ trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
+ qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
out:
@@ -2448,8 +2479,18 @@ out:
return ret;
}
+/*
+ * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0
+ * qgroup).
+ *
+ * Will handle all higher level qgroup too.
+ *
+ * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
+ * This special case is only used for META_PERTRANS type.
+ */
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes)
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -2463,6 +2504,10 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return;
+ if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
+ WARN(1, "%s: Invalid type to free", __func__);
+ return;
+ }
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
@@ -2473,9 +2518,16 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
if (!qgroup)
goto out;
+ if (num_bytes == (u64)-1)
+ /*
+ * We're freeing all pertrans rsv, get reserved value from
+ * level 0 qgroup as real num_bytes to free.
+ */
+ num_bytes = qgroup->rsv.values[type];
+
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- (uintptr_t)qgroup, GFP_ATOMIC);
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
@@ -2485,16 +2537,13 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
- if (qg->reserved < num_bytes)
- report_reserved_underflow(fs_info, qg, num_bytes);
- else
- qg->reserved -= num_bytes;
+ trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
+ qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
@@ -2877,7 +2926,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
goto cleanup;
- ret = qgroup_reserve(root, to_reserve, true);
+ ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -2940,7 +2989,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
+ btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+ BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
extent_changeset_release(&changeset);
@@ -2972,7 +3022,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
extent_changeset_release(&changeset);
@@ -3017,8 +3067,48 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
-int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- bool enforce)
+static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+ type != BTRFS_QGROUP_RSV_META_PERTRANS)
+ return;
+ if (num_bytes == 0)
+ return;
+
+ spin_lock(&root->qgroup_meta_rsv_lock);
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+ root->qgroup_meta_rsv_prealloc += num_bytes;
+ else
+ root->qgroup_meta_rsv_pertrans += num_bytes;
+ spin_unlock(&root->qgroup_meta_rsv_lock);
+}
+
+static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+ type != BTRFS_QGROUP_RSV_META_PERTRANS)
+ return 0;
+ if (num_bytes == 0)
+ return 0;
+
+ spin_lock(&root->qgroup_meta_rsv_lock);
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
+ num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
+ num_bytes);
+ root->qgroup_meta_rsv_prealloc -= num_bytes;
+ } else {
+ num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
+ num_bytes);
+ root->qgroup_meta_rsv_pertrans -= num_bytes;
+ }
+ spin_unlock(&root->qgroup_meta_rsv_lock);
+ return num_bytes;
+}
+
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3028,31 +3118,39 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, (s64)num_bytes);
- ret = qgroup_reserve(root, num_bytes, enforce);
+ trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+ ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
- atomic64_add(num_bytes, &root->qgroup_meta_rsv);
+ /*
+ * Record what we have reserved into root.
+ *
+ * To avoid quota disabled->enabled underflow.
+ * In that case, we may try to free space we haven't reserved
+ * (since quota was disabled), so record what we reserved into root.
+ * And ensure later release won't underflow this number.
+ */
+ add_root_meta_rsv(root, num_bytes, type);
return ret;
}
-void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 reserved;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->objectid))
return;
- reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
- if (reserved == 0)
- return;
- trace_qgroup_meta_reserve(root, -(s64)reserved);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
+ /* TODO: Update trace point to handle such free */
+ trace_qgroup_meta_free_all_pertrans(root);
+ /* Special value -1 means to free all reserved space */
+ btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
}
-void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3060,11 +3158,75 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
!is_fstree(root->objectid))
return;
+ /*
+ * reservation for META_PREALLOC can happen before quota is enabled,
+ * which can lead to underflow.
+ * Here ensure we will only free what we really have reserved.
+ */
+ num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
- atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
- trace_qgroup_meta_reserve(root, -(s64)num_bytes);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
+ trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+}
+
+static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
+ int num_bytes)
+{
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ if (num_bytes == 0)
+ return;
+ if (!quota_root)
+ return;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = unode_aux_to_qgroup(unode);
+
+ qgroup_rsv_release(fs_info, qg, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+ qgroup_rsv_add(fs_info, qg, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ !is_fstree(root->objectid))
+ return;
+ /* Same as btrfs_qgroup_free_meta_prealloc() */
+ num_bytes = sub_root_meta_rsv(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+ trace_qgroup_meta_convert(root, num_bytes);
+ qgroup_convert_meta(fs_info, root->objectid, num_bytes);
}
/*
@@ -3092,7 +3254,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
extent_changeset_release(&changeset);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index d9984e87cddf..e63e2d497a8e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -62,6 +62,48 @@ struct btrfs_qgroup_extent_record {
};
/*
+ * Qgroup reservation types:
+ *
+ * DATA:
+ * space reserved for data
+ *
+ * META_PERTRANS:
+ * Space reserved for metadata (per-transaction)
+ * Due to the fact that qgroup data is only updated at transaction commit
+ * time, reserved space for metadata must be kept until transaction
+ * commits.
+ * Any metadata reservation that is used in btrfs_start_transaction() should
+ * be of this type.
+ *
+ * META_PREALLOC:
+ * There are cases where metadata space is reserved before starting
+ * transaction, and then btrfs_join_transaction() to get a trans handle.
+ * Any metadata reserved for such usage should be of this type.
+ * And after join_transaction() part (or all) of such reservation should
+ * be converted into META_PERTRANS.
+ */
+enum btrfs_qgroup_rsv_type {
+ BTRFS_QGROUP_RSV_DATA = 0,
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ BTRFS_QGROUP_RSV_LAST,
+};
+
+/*
+ * Represents how many bytes we have reserved for this qgroup.
+ *
+ * Each type should have different reservation behavior.
+ * E.g, data follows its io_tree flag modification, while
+ * *currently* meta is just reserve-and-clear during transaction.
+ *
+ * TODO: Add new type for reservation which can survive transaction commit.
+ * Current metadata reservation behavior is not suitable for such case.
+ */
+struct btrfs_qgroup_rsv {
+ u64 values[BTRFS_QGROUP_RSV_LAST];
+};
+
+/*
* one struct for each qgroup, organized in fs_info->qgroup_tree.
*/
struct btrfs_qgroup {
@@ -87,7 +129,7 @@ struct btrfs_qgroup {
/*
* reservation tracking
*/
- u64 reserved;
+ struct btrfs_qgroup_rsv rsv;
/*
* lists
@@ -220,20 +262,21 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
struct ulist *old_roots, struct ulist *new_roots);
-int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes);
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type);
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
+ BTRFS_QGROUP_RSV_DATA);
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -248,9 +291,54 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- bool enforce);
-void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
-void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
+/* Reserve metadata space for pertrans and prealloc type */
+static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
+ int num_bytes, bool enforce)
+{
+ return __btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+}
+static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
+ int num_bytes, bool enforce)
+{
+ return __btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+}
+
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type);
+
+/* Free per-transaction meta reservation for error handling */
+static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
+ int num_bytes)
+{
+ __btrfs_qgroup_free_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+}
+
+/* Pre-allocated meta reservation can be freed at need */
+static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
+ int num_bytes)
+{
+ __btrfs_qgroup_free_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+}
+
+/*
+ * Per-transaction meta reservation should be all freed at transaction commit
+ * time
+ */
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
+
+/*
+ * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
+ *
+ * This is called when preallocated meta reservation needs to be used.
+ * Normally after btrfs_join_transaction() call.
+ */
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
+
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index dec0907dfb8a..c3a2bc8af675 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1370,6 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
stripe_start = stripe->physical;
if (physical >= stripe_start &&
physical < stripe_start + rbio->stripe_len &&
+ stripe->dev->bdev &&
bio->bi_disk == stripe->dev->bdev->bd_disk &&
bio->bi_partno == stripe->dev->bdev->bd_partno) {
return i;
@@ -1986,7 +1987,13 @@ cleanup:
kfree(pointers);
cleanup_io:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ /*
+ * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
+ * valid rbio which is consistent with ondisk content, thus such a
+ * valid rbio can be cached to avoid further disk reads.
+ */
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
/*
* - In case of two failures, where rbio->failb != -1:
*
@@ -2008,8 +2015,6 @@ cleanup_io:
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
rbio_orig_end_io(rbio, err);
- } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- rbio_orig_end_io(rbio, err);
} else if (err == BLK_STS_OK) {
rbio->faila = -1;
rbio->failb = -1;
@@ -2767,24 +2772,8 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
return rbio;
}
-static void missing_raid56_work(struct btrfs_work *work)
-{
- struct btrfs_raid_bio *rbio;
-
- rbio = container_of(work, struct btrfs_raid_bio, work);
- __raid56_parity_recover(rbio);
-}
-
-static void async_missing_raid56(struct btrfs_raid_bio *rbio)
-{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper,
- missing_raid56_work, NULL, NULL);
-
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
-}
-
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
- async_missing_raid56(rbio);
+ async_read_rebuild(rbio);
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ab852b8e3e37..a52dd12af648 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -395,20 +395,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
/* insert extent in reada_tree + all per-device trees, all or nothing */
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
goto error;
}
@@ -451,13 +451,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
}
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
goto error;
}
have_zone = 1;
}
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
if (!have_zone)
goto error;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 171f3cce30e6..35fab67dcbe8 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -579,11 +579,16 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
while (level >= 0) {
if (level) {
+ struct btrfs_key first_key;
+
block_bytenr = btrfs_node_blockptr(path->nodes[level],
path->slots[level]);
gen = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
- eb = read_tree_block(fs_info, block_bytenr, gen);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
+ eb = read_tree_block(fs_info, block_bytenr, gen,
+ level - 1, &first_key);
if (IS_ERR(eb))
return PTR_ERR(eb);
if (!extent_buffer_uptodate(eb)) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f0c3f00e97cb..4874c09f6d3c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1839,6 +1839,8 @@ again:
parent = eb;
while (1) {
+ struct btrfs_key first_key;
+
level = btrfs_header_level(parent);
BUG_ON(level < lowest_level);
@@ -1852,6 +1854,7 @@ again:
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
if (level <= max_level) {
eb = path->nodes[level];
@@ -1876,7 +1879,8 @@ again:
break;
}
- eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen);
+ eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
+ level - 1, &first_key);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -2036,6 +2040,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = *level; i > 0; i--) {
+ struct btrfs_key first_key;
+
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] < nritems) {
@@ -2056,7 +2062,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- eb = read_tree_block(fs_info, bytenr, ptr_gen);
+ btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
+ eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
+ &first_key);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -2714,6 +2722,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
+ struct btrfs_key first_key;
+
cond_resched();
upper = edge->node[UPPER];
@@ -2779,7 +2789,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = root->fs_info->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(fs_info, bytenr, generation);
+ btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
+ eb = read_tree_block(fs_info, bytenr, generation,
+ upper->level - 1, &first_key);
if (IS_ERR(eb)) {
err = PTR_ERR(eb);
goto next;
@@ -2944,7 +2956,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
BUG_ON(block->key_ready);
- eb = read_tree_block(fs_info, block->bytenr, block->key.offset);
+ eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
+ block->level, NULL);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -3226,7 +3239,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
mask);
if (!page) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
ret = -ENOMEM;
goto out;
}
@@ -3245,9 +3258,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
unlock_page(page);
put_page(page);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
ret = -EIO;
goto out;
}
@@ -3268,8 +3281,22 @@ static int relocate_file_extent_cluster(struct inode *inode,
nr++;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL,
- 0);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+ NULL, 0);
+ if (ret) {
+ unlock_page(page);
+ put_page(page);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ PAGE_SIZE, true);
+
+ clear_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+ goto out;
+
+ }
set_page_dirty(page);
unlock_extent(&BTRFS_I(inode)->io_tree,
@@ -3278,7 +3305,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
+ false);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec56f33feea9..1a2066ac6fe7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -371,7 +371,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
- WARN_ON(!mutex_is_locked(&locks_root->lock));
+ lockdep_assert_held(&locks_root->lock);
p = &locks_root->root.rb_node;
while (*p) {
@@ -413,7 +413,7 @@ static struct full_stripe_lock *search_full_stripe_lock(
struct rb_node *node;
struct full_stripe_lock *entry;
- WARN_ON(!mutex_is_locked(&locks_root->lock));
+ lockdep_assert_held(&locks_root->lock);
node = locks_root->root.rb_node;
while (node) {
@@ -1111,7 +1111,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct scrub_ctx *sctx = sblock_to_check->sctx;
struct btrfs_device *dev;
struct btrfs_fs_info *fs_info;
- u64 length;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
@@ -1139,7 +1138,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_unlock(&sctx->stat_lock);
return 0;
}
- length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
@@ -1412,8 +1410,17 @@ nodatasum_case:
if (!page_bad->io_error && !sctx->is_dev_replace)
continue;
- /* try to find no-io-error page in mirrors */
- if (page_bad->io_error) {
+ if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ /*
+ * In case of dev replace, if raid56 rebuild process
+ * didn't work out correct data, then copy the content
+ * in sblock_bad to make sure target device is identical
+ * to source device, instead of writing garbage data in
+ * sblock_for_recheck array to target device.
+ */
+ sblock_other = NULL;
+ } else if (page_bad->io_error) {
+ /* try to find no-io-error page in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index].page_count > 0;
@@ -1718,6 +1725,45 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
return blk_status_to_errno(bio->bi_status);
}
+static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock)
+{
+ struct scrub_page *first_page = sblock->pagev[0];
+ struct bio *bio;
+ int page_num;
+
+ /* All pages in sblock belong to the same stripe on the same device. */
+ ASSERT(first_page->dev);
+ if (!first_page->dev->bdev)
+ goto out;
+
+ bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+ bio_set_dev(bio, first_page->dev->bdev);
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ struct scrub_page *page = sblock->pagev[page_num];
+
+ WARN_ON(!page->page);
+ bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ }
+
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+ bio_put(bio);
+ goto out;
+ }
+
+ bio_put(bio);
+
+ scrub_recheck_block_checksum(sblock);
+
+ return;
+out:
+ for (page_num = 0; page_num < sblock->page_count; page_num++)
+ sblock->pagev[page_num]->io_error = 1;
+
+ sblock->no_io_error_seen = 0;
+}
+
/*
* this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1733,6 +1779,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
sblock->no_io_error_seen = 1;
+ /* short cut for raid56 */
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+ return scrub_recheck_block_on_raid56(fs_info, sblock);
+
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
struct scrub_page *page = sblock->pagev[page_num];
@@ -1748,19 +1798,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio_set_dev(bio, page->dev->bdev);
bio_add_page(bio, page->page, PAGE_SIZE, 0);
- if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
- if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
- page->io_error = 1;
- sblock->no_io_error_seen = 0;
- }
- } else {
- bio->bi_iter.bi_sector = page->physical >> 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_iter.bi_sector = page->physical >> 9;
+ bio->bi_opf = REQ_OP_READ;
- if (btrfsic_submit_bio_wait(bio)) {
- page->io_error = 1;
- sblock->no_io_error_seen = 0;
- }
+ if (btrfsic_submit_bio_wait(bio)) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
}
bio_put(bio);
@@ -2728,7 +2771,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
}
/* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
+ u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
@@ -2737,13 +2781,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->fs_info->sectorsize;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ blocksize = map->stripe_len;
+ else
+ blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->fs_info->nodesize;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ blocksize = map->stripe_len;
+ else
+ blocksize = sctx->fs_info->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
sctx->stat.tree_bytes_scrubbed += len;
@@ -2883,9 +2933,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->fs_info->sectorsize;
+ blocksize = sparity->stripe_len;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->fs_info->nodesize;
+ blocksize = sparity->stripe_len;
} else {
blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
@@ -3595,7 +3645,7 @@ again:
if (ret)
goto out;
- ret = scrub_extent(sctx, extent_logical, extent_len,
+ ret = scrub_extent(sctx, map, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
extent_logical - logical + physical);
@@ -3885,11 +3935,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
- btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_lock(&fs_info->dev_replace);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache, is_dev_replace);
@@ -3925,10 +3975,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_lock(&fs_info->dev_replace);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -4144,16 +4194,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EIO;
}
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -EINPROGRESS;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
@@ -4480,7 +4530,8 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
* move on to the next inode.
*/
if (em->block_start > logical ||
- em->block_start + em->block_len < logical + len) {
+ em->block_start + em->block_len < logical + len ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
free_extent_map(em);
ret = 1;
goto out_unlock;
@@ -4620,7 +4671,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
{
struct bio *bio;
struct btrfs_device *dev;
- int ret;
dev = sctx->wr_tgtdev;
if (!dev)
@@ -4635,17 +4685,15 @@ static int write_page_nocow(struct scrub_ctx *sctx,
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio_set_dev(bio, dev->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
- ret = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
-leave_with_eio:
+ /* bio_add_page won't fail on a freshly allocated bio */
+ bio_add_page(bio, page, PAGE_SIZE, 0);
+
+ if (btrfsic_submit_bio_wait(bio)) {
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- if (btrfsic_submit_bio_wait(bio))
- goto leave_with_eio;
-
bio_put(bio);
return 0;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f306c608dc28..1f5748c7d1c7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -27,10 +27,10 @@
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
+#include <linux/crc32c.h>
#include "send.h"
#include "backref.h"
-#include "hash.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
@@ -112,6 +112,7 @@ struct send_ctx {
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
+ u64 cur_inode_next_write_offset;
u64 send_progress;
@@ -270,6 +271,7 @@ struct name_cache_entry {
char name[];
};
+__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
enum btrfs_compare_tree_result result,
const char *what)
@@ -611,9 +613,9 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
}
-#define TLV_PUT(sctx, attrtype, attrlen, data) \
+#define TLV_PUT(sctx, attrtype, data, attrlen) \
do { \
- ret = tlv_put(sctx, attrtype, attrlen, data); \
+ ret = tlv_put(sctx, attrtype, data, attrlen); \
if (ret < 0) \
goto tlv_put_failure; \
} while (0)
@@ -695,7 +697,7 @@ static int send_cmd(struct send_ctx *sctx)
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
hdr->crc = 0;
- crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5005,6 +5007,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
u64 len;
int ret = 0;
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, end - offset);
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5026,6 +5031,7 @@ static int send_hole(struct send_ctx *sctx, u64 end)
break;
offset += len;
}
+ sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
fs_path_free(p);
return ret;
@@ -5261,6 +5267,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
} else {
ret = send_extent_data(sctx, offset, len);
}
+ sctx->cur_inode_next_write_offset = offset + len;
out:
return ret;
}
@@ -5785,6 +5792,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
u64 right_gid;
int need_chmod = 0;
int need_chown = 0;
+ int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
@@ -5822,9 +5830,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode))
need_chmod = 1;
+ if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
+ need_truncate = 0;
} else {
+ u64 old_size;
+
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
- NULL, NULL, &right_mode, &right_uid,
+ &old_size, NULL, &right_mode, &right_uid,
&right_gid, NULL);
if (ret < 0)
goto out;
@@ -5833,6 +5845,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+ if ((old_size == sctx->cur_inode_size) ||
+ (sctx->cur_inode_size > old_size &&
+ sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
+ need_truncate = 0;
}
if (S_ISREG(sctx->cur_inode_mode)) {
@@ -5851,10 +5867,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
}
- ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
- sctx->cur_inode_size);
- if (ret < 0)
- goto out;
+ if (need_truncate) {
+ ret = send_truncate(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen,
+ sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ }
}
if (need_chown) {
@@ -5908,6 +5927,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
+ sctx->cur_inode_next_write_offset = 0;
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6e71a2a78363..170baef49fae 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
+#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include "delayed-inode.h"
#include "ctree.h"
@@ -48,7 +49,6 @@
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
-#include "hash.h"
#include "props.h"
#include "xattr.h"
#include "volumes.h"
@@ -308,21 +308,50 @@ static void btrfs_put_super(struct super_block *sb)
}
enum {
- Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
- Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
- Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
- Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
- Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
- Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
- Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
- Opt_skip_balance, Opt_check_integrity,
+ Opt_acl, Opt_noacl,
+ Opt_clear_cache,
+ Opt_commit_interval,
+ Opt_compress,
+ Opt_compress_force,
+ Opt_compress_force_type,
+ Opt_compress_type,
+ Opt_degraded,
+ Opt_device,
+ Opt_fatal_errors,
+ Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_inode_cache, Opt_noinode_cache,
+ Opt_max_inline,
+ Opt_barrier, Opt_nobarrier,
+ Opt_datacow, Opt_nodatacow,
+ Opt_datasum, Opt_nodatasum,
+ Opt_defrag, Opt_nodefrag,
+ Opt_discard, Opt_nodiscard,
+ Opt_nologreplay,
+ Opt_norecovery,
+ Opt_ratio,
+ Opt_rescan_uuid_tree,
+ Opt_skip_balance,
+ Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache_version,
+ Opt_ssd, Opt_nossd,
+ Opt_ssd_spread, Opt_nossd_spread,
+ Opt_subvol,
+ Opt_subvolid,
+ Opt_thread_pool,
+ Opt_treelog, Opt_notreelog,
+ Opt_usebackuproot,
+ Opt_user_subvol_rm_allowed,
+
+ /* Deprecated options */
+ Opt_alloc_start,
+ Opt_recovery,
+ Opt_subvolrootid,
+
+ /* Debugging options */
+ Opt_check_integrity,
Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
- Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
- Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
- Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
- Opt_nologreplay, Opt_norecovery,
+ Opt_check_integrity_print_mask,
+ Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
@@ -333,58 +362,63 @@ enum {
};
static const match_table_t tokens = {
- {Opt_degraded, "degraded"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_device, "device=%s"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_datasum, "datasum"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datacow, "datacow"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_barrier, "barrier"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_alloc_start, "alloc_start=%s"},
- {Opt_thread_pool, "thread_pool=%d"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_clear_cache, "clear_cache"},
+ {Opt_commit_interval, "commit=%u"},
{Opt_compress, "compress"},
{Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
{Opt_compress_force_type, "compress-force=%s"},
- {Opt_ssd, "ssd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd, "nossd"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_notreelog, "notreelog"},
- {Opt_treelog, "treelog"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_norecovery, "norecovery"},
+ {Opt_degraded, "degraded"},
+ {Opt_device, "device=%s"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_noflushoncommit, "noflushoncommit"},
- {Opt_ratio, "metadata_ratio=%d"},
+ {Opt_inode_cache, "inode_cache"},
+ {Opt_noinode_cache, "noinode_cache"},
+ {Opt_max_inline, "max_inline=%s"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_datacow, "datacow"},
+ {Opt_nodatacow, "nodatacow"},
+ {Opt_datasum, "datasum"},
+ {Opt_nodatasum, "nodatasum"},
+ {Opt_defrag, "autodefrag"},
+ {Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_norecovery, "norecovery"},
+ {Opt_ratio, "metadata_ratio=%u"},
+ {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+ {Opt_skip_balance, "skip_balance"},
{Opt_space_cache, "space_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
{Opt_space_cache_version, "space_cache=%s"},
- {Opt_clear_cache, "clear_cache"},
+ {Opt_ssd, "ssd"},
+ {Opt_nossd, "nossd"},
+ {Opt_ssd_spread, "ssd_spread"},
+ {Opt_nossd_spread, "nossd_spread"},
+ {Opt_subvol, "subvol=%s"},
+ {Opt_subvolid, "subvolid=%s"},
+ {Opt_thread_pool, "thread_pool=%u"},
+ {Opt_treelog, "treelog"},
+ {Opt_notreelog, "notreelog"},
+ {Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+
+ /* Deprecated options */
+ {Opt_alloc_start, "alloc_start=%s"},
+ {Opt_recovery, "recovery"},
{Opt_subvolrootid, "subvolrootid=%d"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_recovery, "recovery"}, /* deprecated */
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_skip_balance, "skip_balance"},
+
+ /* Debugging options */
{Opt_check_integrity, "check_int"},
{Opt_check_integrity_including_extent_data, "check_int_data"},
- {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_commit_interval, "commit=%d"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
+ {Opt_enospc_debug, "enospc_debug"},
+ {Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
@@ -579,6 +613,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
+ /* Fallthrough */
+ case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
break;
@@ -594,12 +630,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
ret = match_int(&args[0], &intarg);
if (ret) {
goto out;
- } else if (intarg > 0) {
- info->thread_pool_size = intarg;
- } else {
+ } else if (intarg == 0) {
ret = -EINVAL;
goto out;
}
+ info->thread_pool_size = intarg;
break;
case Opt_max_inline:
num = match_strdup(&args[0]);
@@ -658,16 +693,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret) {
+ if (ret)
goto out;
- } else if (intarg >= 0) {
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %d",
- info->metadata_ratio);
- } else {
- ret = -EINVAL;
- goto out;
- }
+ info->metadata_ratio = intarg;
+ btrfs_info(info, "metadata ratio %u",
+ info->metadata_ratio);
break;
case Opt_discard:
btrfs_set_and_info(info, DISCARD,
@@ -762,17 +792,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret) {
+ if (ret)
goto out;
- } else if (intarg >= 0) {
- info->check_integrity_print_mask = intarg;
- btrfs_info(info,
- "check_integrity_print_mask 0x%x",
- info->check_integrity_print_mask);
- } else {
- ret = -EINVAL;
- goto out;
- }
+ info->check_integrity_print_mask = intarg;
+ btrfs_info(info, "check_integrity_print_mask 0x%x",
+ info->check_integrity_print_mask);
break;
#else
case Opt_check_integrity_including_extent_data:
@@ -798,24 +822,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret < 0) {
- btrfs_err(info, "invalid commit interval");
- ret = -EINVAL;
+ if (ret)
goto out;
- }
- if (intarg > 0) {
- if (intarg > 300) {
- btrfs_warn(info,
- "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
- } else {
+ if (intarg == 0) {
btrfs_info(info,
- "using default commit interval %ds",
+ "using default commit interval %us",
BTRFS_DEFAULT_COMMIT_INTERVAL);
- info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ } else if (intarg > 300) {
+ btrfs_warn(info, "excessive commit interval %d",
+ intarg);
}
+ info->commit_interval = intarg;
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
@@ -932,8 +950,8 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
{
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
- char *num = NULL;
int error = 0;
+ u64 subvolid;
if (!options)
return 0;
@@ -963,18 +981,15 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
}
break;
case Opt_subvolid:
- num = match_strdup(&args[0]);
- if (num) {
- *subvol_objectid = memparse(num, NULL);
- kfree(num);
- /* we want the original fs_tree */
- if (!*subvol_objectid)
- *subvol_objectid =
- BTRFS_FS_TREE_OBJECTID;
- } else {
- error = -EINVAL;
+ error = match_u64(&args[0], &subvolid);
+ if (error)
goto out;
- }
+
+ /* we want the original fs_tree */
+ if (subvolid == 0)
+ subvolid = BTRFS_FS_TREE_OBJECTID;
+
+ *subvol_objectid = subvolid;
break;
case Opt_subvolrootid:
pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
@@ -1284,7 +1299,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
- seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+ seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
if (btrfs_test_opt(info, COMPRESS)) {
compress_type = btrfs_compress_type2str(info->compress_type);
if (btrfs_test_opt(info, FORCE_COMPRESS))
@@ -1340,12 +1355,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
info->check_integrity_print_mask);
#endif
if (info->metadata_ratio)
- seq_printf(seq, ",metadata_ratio=%d",
- info->metadata_ratio);
+ seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
- seq_printf(seq, ",commit=%d", info->commit_interval);
+ seq_printf(seq, ",commit=%u", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
@@ -1545,7 +1559,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
* it for searching for existing supers, so this lets us do that and
* then open_ctree will properly initialize everything later.
*/
- fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
@@ -1690,7 +1704,7 @@ out:
}
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
- int new_pool_size, int old_pool_size)
+ u32 new_pool_size, u32 old_pool_size)
{
if (new_pool_size == old_pool_size)
return;
@@ -1758,8 +1772,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
u64 old_max_inline = fs_info->max_inline;
- int old_thread_pool_size = fs_info->thread_pool_size;
- unsigned int old_metadata_ratio = fs_info->metadata_ratio;
+ u32 old_thread_pool_size = fs_info->thread_pool_size;
+ u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
sync_filesystem(sb);
@@ -2290,11 +2304,18 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
struct list_head *head;
struct rcu_string *name;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ /*
+ * Lightweight locking of the devices. We should not need
+ * device_list_mutex here as we only read the device data and the list
+ * is protected by RCU. Even if a device is deleted during the list
+ * traversals, we'll get valid data, the freeing callback will wait at
+ * least until the rcu_read_unlock.
+ */
+ rcu_read_lock();
cur_devices = fs_info->fs_devices;
while (cur_devices) {
head = &cur_devices->devices;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->name)
@@ -2306,14 +2327,12 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
}
if (first_dev) {
- rcu_read_lock();
name = rcu_dereference(first_dev->name);
seq_escape(m, name->str, " \t\n\\");
- rcu_read_unlock();
} else {
WARN_ON(1);
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ rcu_read_unlock();
return 0;
}
@@ -2355,7 +2374,7 @@ static int __init btrfs_interface_init(void)
return misc_register(&btrfs_misc);
}
-static void btrfs_interface_exit(void)
+static __cold void btrfs_interface_exit(void)
{
misc_deregister(&btrfs_misc);
}
@@ -2376,22 +2395,18 @@ static void __init btrfs_print_mod_info(void)
", ref-verify=on"
#endif
"\n",
- btrfs_crc32c_impl());
+ crc32c_impl());
}
static int __init init_btrfs_fs(void)
{
int err;
- err = btrfs_hash_init();
- if (err)
- return err;
-
btrfs_props_init();
err = btrfs_init_sysfs();
if (err)
- goto free_hash;
+ return err;
btrfs_init_compress();
@@ -2472,8 +2487,7 @@ free_cachep:
free_compress:
btrfs_exit_compress();
btrfs_exit_sysfs();
-free_hash:
- btrfs_hash_exit();
+
return err;
}
@@ -2493,7 +2507,6 @@ static void __exit exit_btrfs_fs(void)
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
- btrfs_hash_exit();
}
late_initcall(init_btrfs_fs);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a8bafed931f4..ca067471cd46 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -272,7 +272,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
struct btrfs_block_group_cache *block_group;
- int index = to_raid_kobj(kobj)->raid_type;
+ int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
u64 val = 0;
down_read(&sinfo->groups_sem);
@@ -923,7 +923,7 @@ out1:
return ret;
}
-void btrfs_exit_sysfs(void)
+void __cold btrfs_exit_sysfs(void)
{
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9786d8cd0aa6..e74278170806 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -278,8 +278,7 @@ int btrfs_run_sanity_tests(void)
}
}
ret = btrfs_test_extent_map();
- if (ret)
- goto out;
+
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 70c993f01670..c23bd00bdd92 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -343,7 +343,7 @@ static void test_case_4(struct extent_map_tree *em_tree)
__test_case_4(em_tree, SZ_4K);
}
-int btrfs_test_extent_map()
+int btrfs_test_extent_map(void)
{
struct extent_map_tree *em_tree;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 90204b166643..160eb2fba726 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -63,7 +63,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
btrfs_set_extent_generation(leaf, item, 1);
btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
block_info = (struct btrfs_tree_block_info *)(item + 1);
- btrfs_set_tree_block_level(leaf, block_info, 1);
+ btrfs_set_tree_block_level(leaf, block_info, 0);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
if (parent > 0) {
btrfs_set_extent_inline_ref_type(leaf, iref,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04f07144b45c..5c4cf0f9146b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -37,22 +37,16 @@
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
- [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
- __TRANS_START),
- [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
- __TRANS_START |
- __TRANS_ATTACH),
- [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
- __TRANS_START |
+ [TRANS_STATE_BLOCKED] = __TRANS_START,
+ [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN),
- [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
- __TRANS_START |
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK),
- [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
- __TRANS_START |
+ [TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK),
@@ -126,9 +120,9 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
spin_unlock(&tree->lock);
}
-static noinline void switch_commit_roots(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
down_write(&fs_info->commit_root_sem);
@@ -319,7 +313,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
root->last_trans < trans->transid) || force) {
WARN_ON(root == fs_info->extent_root);
- WARN_ON(root->commit_root != root->node);
+ WARN_ON(!force && root->commit_root != root->node);
/*
* see below for IN_TRANS_SETUP usage rules
@@ -449,11 +443,7 @@ static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return 0;
- if (type == TRANS_USERSPACE)
- return 1;
-
- if (type == TRANS_START &&
- !atomic_read(&fs_info->open_ioctl_trans))
+ if (type == TRANS_START)
return 1;
return 0;
@@ -508,8 +498,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
*/
if (num_items && root != fs_info->chunk_root) {
qgroup_reserved = num_items * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
- enforce_qgroups);
+ ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
+ enforce_qgroups);
if (ret)
return ERR_PTR(ret);
@@ -593,7 +583,7 @@ again:
got_it:
btrfs_record_root_in_trans(h, root);
- if (!current->journal_info && type != TRANS_USERSPACE)
+ if (!current->journal_info)
current->journal_info = h;
return h;
@@ -606,7 +596,7 @@ alloc_fail:
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- btrfs_qgroup_free_meta(root, qgroup_reserved);
+ btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
}
@@ -658,14 +648,6 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
return trans;
}
-struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root,
- unsigned int num_items)
-{
- return start_transaction(root, num_items, TRANS_START,
- BTRFS_RESERVE_FLUSH_LIMIT, true);
-}
-
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
@@ -678,12 +660,6 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root
BTRFS_RESERVE_NO_FLUSH, true);
}
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
-{
- return start_transaction(root, 0, TRANS_USERSPACE,
- BTRFS_RESERVE_NO_FLUSH, true);
-}
-
/*
* btrfs_attach_transaction() - catch the running transaction
*
@@ -789,8 +765,7 @@ out:
void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
- if (!atomic_read(&fs_info->open_ioctl_trans))
- wait_current_trans(fs_info);
+ wait_current_trans(fs_info);
}
static int should_end_transaction(struct btrfs_trans_handle *trans)
@@ -806,7 +781,6 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_fs_info *fs_info = trans->fs_info;
int updates;
int err;
@@ -818,7 +792,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates) {
- err = btrfs_run_delayed_refs(trans, fs_info, updates * 2);
+ err = btrfs_run_delayed_refs(trans, updates * 2);
if (err) /* Error code will also eval true */
return err;
}
@@ -826,6 +800,27 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
return should_end_transaction(trans);
}
+static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
+
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (!trans->block_rsv) {
+ ASSERT(!trans->bytes_reserved);
+ return;
+ }
+
+ if (!trans->bytes_reserved)
+ return;
+
+ ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->bytes_reserved = 0;
+}
+
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int throttle)
{
@@ -843,11 +838,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- btrfs_trans_release_metadata(trans, info);
+ btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, info);
+ btrfs_create_pending_block_groups(trans);
trans->delayed_ref_updates = 0;
if (!trans->sync) {
@@ -864,16 +859,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- btrfs_trans_release_metadata(trans, info);
+ btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, info);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
- if (lock && !atomic_read(&info->open_ioctl_trans) &&
- should_end_transaction(trans) &&
+ if (lock && should_end_transaction(trans) &&
READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
spin_lock(&info->trans_lock);
if (cur_trans->state == TRANS_STATE_RUNNING)
@@ -1072,40 +1066,33 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
}
/*
- * when btree blocks are allocated, they have some corresponding bits set for
- * them in one of two extent_io trees. This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * When btree blocks are allocated the corresponding extents are marked dirty.
+ * This function ensures such extents are persisted on disk for transaction or
+ * log commit.
+ *
+ * @trans: transaction whose dirty pages we'd like to write
*/
-static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages, int mark)
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
{
int ret;
int ret2;
+ struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct blk_plug plug;
blk_start_plug(&plug);
- ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark);
+ ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
+ clear_btree_io_tree(&trans->transaction->dirty_pages);
+
if (ret)
return ret;
- if (ret2)
+ else if (ret2)
return ret2;
- return 0;
-}
-
-static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- int ret;
-
- ret = btrfs_write_and_wait_marked_extents(fs_info,
- &trans->transaction->dirty_pages,
- EXTENT_DIRTY);
- clear_btree_io_tree(&trans->transaction->dirty_pages);
-
- return ret;
+ else
+ return 0;
}
/*
@@ -1155,9 +1142,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
* failures will cause the file system to go offline. We still need
* to clean up the delayed refs.
*/
-static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
struct list_head *next;
@@ -1173,7 +1160,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
@@ -1192,7 +1179,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
return ret;
/* run_qgroups might have added some more refs */
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
again:
@@ -1209,7 +1196,7 @@ again:
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1218,7 +1205,7 @@ again:
ret = btrfs_write_dirty_block_groups(trans, fs_info);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1251,9 +1238,9 @@ void btrfs_add_dead_root(struct btrfs_root *root)
/*
* update all the cowonly tree roots on disk
*/
-static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *gang[8];
int i;
int ret;
@@ -1297,7 +1284,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
- btrfs_qgroup_free_meta_all(root);
+ btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1366,15 +1353,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return 0;
/*
+ * Ensure dirty @src will be committed. Otherwise, after the coming
+ * commit_fs_roots() and switch_commit_roots(), any dirty but not
+ * recorded root will never be updated again, causing an outdated root
+ * item.
+ */
+ record_root_in_trans(trans, src, 1);
+
+ /*
* We are going to commit transaction, see btrfs_commit_transaction()
* comment for reason locking tree_log_mutex
*/
mutex_lock(&fs_info->tree_log_mutex);
- ret = commit_fs_roots(trans, fs_info);
+ ret = commit_fs_roots(trans);
if (ret)
goto out;
- ret = btrfs_qgroup_account_extents(trans, fs_info);
+ ret = btrfs_qgroup_account_extents(trans);
if (ret < 0)
goto out;
@@ -1397,11 +1392,11 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* like chunk and root tree, as they won't affect qgroup.
* And we don't write super to avoid half committed status.
*/
- ret = commit_cowonly_roots(trans, fs_info);
+ ret = commit_cowonly_roots(trans);
if (ret)
goto out;
- switch_commit_roots(trans->transaction, fs_info);
- ret = btrfs_write_and_wait_transaction(trans, fs_info);
+ switch_commit_roots(trans->transaction);
+ ret = btrfs_write_and_wait_transaction(trans);
if (ret)
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction for qgroup");
@@ -1430,9 +1425,10 @@ out:
* the creation of the pending snapshots, just return 0.
*/
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_pending_snapshot *pending)
{
+
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key key;
struct btrfs_root_item *new_root_item;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -1524,7 +1520,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* otherwise we corrupt the FS during
* snapshot
*/
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret) { /* Transaction aborted */
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1620,7 +1616,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1674,7 +1670,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1699,8 +1695,7 @@ no_free_objectid:
/*
* create all the snapshots we've scheduled for creation
*/
-static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
{
struct btrfs_pending_snapshot *pending, *next;
struct list_head *head = &trans->transaction->pending_snapshots;
@@ -1708,7 +1703,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
list_for_each_entry_safe(pending, next, head, list) {
list_del(&pending->list);
- ret = create_pending_snapshot(trans, fs_info, pending);
+ ret = create_pending_snapshot(trans, pending);
if (ret)
break;
}
@@ -1861,10 +1856,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
}
-static void cleanup_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int err)
+static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
DEFINE_WAIT(wait);
@@ -1904,7 +1898,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
- trace_btrfs_transaction_commit(root);
+ trace_btrfs_transaction_commit(trans->root);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1959,13 +1953,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
btrfs_end_transaction(trans);
return ret;
}
- btrfs_trans_release_metadata(trans, fs_info);
+ btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
cur_trans = trans->transaction;
@@ -1978,9 +1972,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
smp_wmb();
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
btrfs_end_transaction(trans);
return ret;
@@ -2008,12 +2002,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
run_it = 1;
mutex_unlock(&fs_info->ro_block_group_mutex);
- if (run_it)
- ret = btrfs_start_dirty_block_groups(trans, fs_info);
- }
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
+ if (run_it) {
+ ret = btrfs_start_dirty_block_groups(trans);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+ }
}
spin_lock(&fs_info->trans_lock);
@@ -2061,7 +2056,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto cleanup_transaction;
@@ -2069,7 +2064,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto cleanup_transaction;
@@ -2106,7 +2101,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* deal with them in create_pending_snapshot(), which is the
* core function of the snapshot creation.
*/
- ret = create_pending_snapshots(trans, fs_info);
+ ret = create_pending_snapshots(trans);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
@@ -2122,13 +2117,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* because all the tree which are snapshoted will be forced to COW
* the nodes and leaves.
*/
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
@@ -2157,7 +2152,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
mutex_lock(&fs_info->tree_log_mutex);
- ret = commit_fs_roots(trans, fs_info);
+ ret = commit_fs_roots(trans);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2179,7 +2174,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* commit_fs_roots() can call btrfs_save_ino_cache(), which generates
* new delayed refs. Must handle them or qgroup can be wrong.
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2190,14 +2185,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
- ret = btrfs_qgroup_account_extents(trans, fs_info);
+ ret = btrfs_qgroup_account_extents(trans);
if (ret < 0) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- ret = commit_cowonly_roots(trans, fs_info);
+ ret = commit_cowonly_roots(trans);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2229,7 +2224,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
- switch_commit_roots(cur_trans, fs_info);
+ switch_commit_roots(cur_trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
@@ -2241,7 +2236,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
sizeof(*fs_info->super_copy));
btrfs_update_commit_device_size(fs_info);
- btrfs_update_commit_device_bytes_used(fs_info, cur_trans);
+ btrfs_update_commit_device_bytes_used(cur_trans);
clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
@@ -2256,7 +2251,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
- ret = btrfs_write_and_wait_transaction(trans, fs_info);
+ ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
@@ -2273,7 +2268,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
- btrfs_finish_extent_commit(trans, fs_info);
+ btrfs_finish_extent_commit(trans);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
@@ -2319,13 +2314,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
- btrfs_trans_release_metadata(trans, fs_info);
+ btrfs_trans_release_metadata(trans);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
- cleanup_transaction(trans, trans->root, ret);
+ cleanup_transaction(trans, ret);
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6beee072b1bd..b6c94ce33503 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -69,6 +69,22 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct list_head switch_commits;
struct list_head dirty_bgs;
+
+ /*
+ * There is no explicit lock which protects io_bgs, rather its
+ * consistency is implied by the fact that all the sites which modify
+ * it do so under some form of transaction critical section, namely:
+ *
+ * - btrfs_start_dirty_block_groups - This function can only ever be
+ * run by one of the transaction committers. Refer to
+ * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction
+ *
+ * - btrfs_write_dirty_block_groups - this is called by
+ * commit_cowonly_roots from transaction critical section
+ * (TRANS_STATE_COMMIT_DOING)
+ *
+ * - btrfs_cleanup_dirty_bgs - called on transaction abort
+ */
struct list_head io_bgs;
struct list_head dropped_roots;
@@ -89,21 +105,18 @@ struct btrfs_transaction {
#define __TRANS_FREEZABLE (1U << 0)
-#define __TRANS_USERSPACE (1U << 8)
#define __TRANS_START (1U << 9)
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
#define __TRANS_DUMMY (1U << 13)
-#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
-#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
- __TRANS_ATTACH)
+#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
@@ -186,15 +199,11 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
unsigned int num_items,
int min_factor);
-struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root,
- unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index c3c8d48f6618..8871286c1a91 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -30,7 +30,6 @@
#include "tree-checker.h"
#include "disk-io.h"
#include "compression.h"
-#include "hash.h"
/*
* Error message should follow the following format:
@@ -53,7 +52,8 @@
* Allows callers to customize the output.
*/
__printf(4, 5)
-static void generic_err(const struct btrfs_root *root,
+__cold
+static void generic_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
@@ -65,10 +65,10 @@ static void generic_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
- root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf);
va_end(args);
}
@@ -77,7 +77,8 @@ static void generic_err(const struct btrfs_root *root,
* offset has its own meaning.
*/
__printf(4, 5)
-static void file_extent_err(const struct btrfs_root *root,
+__cold
+static void file_extent_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
@@ -91,10 +92,11 @@ static void file_extent_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
- btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf);
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
va_end(args);
}
@@ -102,26 +104,26 @@ static void file_extent_err(const struct btrfs_root *root,
* Return 0 if the btrfs_file_extent_##name is aligned to @alignment
* Else return 1
*/
-#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \
+#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \
({ \
if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
- file_extent_err((root), (leaf), (slot), \
+ file_extent_err((fs_info), (leaf), (slot), \
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)), \
(alignment)); \
(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
})
-static int check_extent_data_item(struct btrfs_root *root,
+static int check_extent_data_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_file_extent_item *fi;
- u32 sectorsize = root->fs_info->sectorsize;
+ u32 sectorsize = fs_info->sectorsize;
u32 item_size = btrfs_item_size_nr(leaf, slot);
if (!IS_ALIGNED(key->offset, sectorsize)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
@@ -130,7 +132,7 @@ static int check_extent_data_item(struct btrfs_root *root,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
BTRFS_FILE_EXTENT_TYPES);
@@ -142,14 +144,14 @@ static int check_extent_data_item(struct btrfs_root *root,
* and must be caught in open_ctree().
*/
if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
BTRFS_COMPRESS_TYPES);
return -EUCLEAN;
}
if (btrfs_file_extent_encryption(leaf, fi)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid encryption for file extent, have %u expect 0",
btrfs_file_extent_encryption(leaf, fi));
return -EUCLEAN;
@@ -157,7 +159,7 @@ static int check_extent_data_item(struct btrfs_root *root,
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
/* Inline extent must have 0 as key offset */
if (key->offset) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid file_offset for inline file extent, have %llu expect 0",
key->offset);
return -EUCLEAN;
@@ -171,7 +173,7 @@ static int check_extent_data_item(struct btrfs_root *root,
/* Uncompressed inline extent size must match item size */
if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
btrfs_file_extent_ram_bytes(leaf, fi)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
btrfs_file_extent_ram_bytes(leaf, fi));
@@ -182,40 +184,41 @@ static int check_extent_data_item(struct btrfs_root *root,
/* Regular or preallocated extent has fixed item size */
if (item_size != sizeof(*fi)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid item size for reg/prealloc file extent, have %u expect %zu",
item_size, sizeof(*fi));
return -EUCLEAN;
}
- if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize))
+ if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize))
return -EUCLEAN;
return 0;
}
-static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+static int check_csum_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
{
- u32 sectorsize = root->fs_info->sectorsize;
- u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy);
+ u32 sectorsize = fs_info->sectorsize;
+ u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"invalid key objectid for csum item, have %llu expect %llu",
key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
return -EUCLEAN;
}
if (!IS_ALIGNED(key->offset, sectorsize)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"unaligned key offset for csum item, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
btrfs_item_size_nr(leaf, slot), csumsize);
return -EUCLEAN;
@@ -228,7 +231,8 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
* which represents inode number
*/
__printf(4, 5)
-static void dir_item_err(const struct btrfs_root *root,
+__cold
+static void dir_item_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
@@ -242,14 +246,15 @@ static void dir_item_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
- btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
va_end(args);
}
-static int check_dir_item(struct btrfs_root *root,
+static int check_dir_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
@@ -268,7 +273,7 @@ static int check_dir_item(struct btrfs_root *root,
/* header itself should not cross item boundary */
if (cur + sizeof(*di) > item_size) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item header crosses item boundary, have %zu boundary %u",
cur + sizeof(*di), item_size);
return -EUCLEAN;
@@ -277,7 +282,7 @@ static int check_dir_item(struct btrfs_root *root,
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
if (dir_type >= BTRFS_FT_MAX) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
@@ -285,14 +290,14 @@ static int check_dir_item(struct btrfs_root *root,
if (key->type == BTRFS_XATTR_ITEM_KEY &&
dir_type != BTRFS_FT_XATTR) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"invalid dir item type for XATTR key, have %u expect %u",
dir_type, BTRFS_FT_XATTR);
return -EUCLEAN;
}
if (dir_type == BTRFS_FT_XATTR &&
key->type != BTRFS_XATTR_ITEM_KEY) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"xattr dir type found for non-XATTR key");
return -EUCLEAN;
}
@@ -305,21 +310,21 @@ static int check_dir_item(struct btrfs_root *root,
name_len = btrfs_dir_name_len(leaf, di);
data_len = btrfs_dir_data_len(leaf, di);
if (name_len > max_name_len) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item name len too long, have %u max %u",
name_len, max_name_len);
return -EUCLEAN;
}
- if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
- dir_item_err(root, leaf, slot,
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ dir_item_err(fs_info, leaf, slot,
"dir item name and data len too long, have %u max %u",
name_len + data_len,
- BTRFS_MAX_XATTR_SIZE(root->fs_info));
+ BTRFS_MAX_XATTR_SIZE(fs_info));
return -EUCLEAN;
}
if (data_len && dir_type != BTRFS_FT_XATTR) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item with invalid data len, have %u expect 0",
data_len);
return -EUCLEAN;
@@ -329,7 +334,7 @@ static int check_dir_item(struct btrfs_root *root,
/* header and name/data should not cross item boundary */
if (cur + total_size > item_size) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item data crosses item boundary, have %u boundary %u",
cur + total_size, item_size);
return -EUCLEAN;
@@ -347,7 +352,7 @@ static int check_dir_item(struct btrfs_root *root,
(unsigned long)(di + 1), name_len);
name_hash = btrfs_name_hash(namebuf, name_len);
if (key->offset != name_hash) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"name hash mismatch with key, have 0x%016x expect 0x%016llx",
name_hash, key->offset);
return -EUCLEAN;
@@ -362,7 +367,7 @@ static int check_dir_item(struct btrfs_root *root,
/*
* Common point to switch the item-specific validation.
*/
-static int check_leaf_item(struct btrfs_root *root,
+static int check_leaf_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
@@ -370,24 +375,23 @@ static int check_leaf_item(struct btrfs_root *root,
switch (key->type) {
case BTRFS_EXTENT_DATA_KEY:
- ret = check_extent_data_item(root, leaf, key, slot);
+ ret = check_extent_data_item(fs_info, leaf, key, slot);
break;
case BTRFS_EXTENT_CSUM_KEY:
- ret = check_csum_item(root, leaf, key, slot);
+ ret = check_csum_item(fs_info, leaf, key, slot);
break;
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
case BTRFS_XATTR_ITEM_KEY:
- ret = check_dir_item(root, leaf, key, slot);
+ ret = check_dir_item(fs_info, leaf, key, slot);
break;
}
return ret;
}
-static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
+static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
bool check_item_data)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
/* No valid key type is 0, so all key should be larger than this key */
struct btrfs_key prev_key = {0, 0, 0};
struct btrfs_key key;
@@ -420,7 +424,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
eb = btrfs_root_node(check_root);
/* if leaf is the root, then it's fine */
if (leaf != eb) {
- generic_err(check_root, leaf, 0,
+ generic_err(fs_info, leaf, 0,
"invalid nritems, have %u should not be 0 for non-root leaf",
nritems);
free_extent_buffer(eb);
@@ -453,7 +457,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
/* Make sure the keys are in the right order */
if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
prev_key.objectid, prev_key.type,
prev_key.offset, key.objectid, key.type,
@@ -472,7 +476,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
item_end_expected = btrfs_item_offset_nr(leaf,
slot - 1);
if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"unexpected item end, have %u expect %u",
btrfs_item_end_nr(leaf, slot),
item_end_expected);
@@ -486,7 +490,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
*/
if (btrfs_item_end_nr(leaf, slot) >
BTRFS_LEAF_DATA_SIZE(fs_info)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
btrfs_item_end_nr(leaf, slot),
BTRFS_LEAF_DATA_SIZE(fs_info));
@@ -496,7 +500,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
/* Also check if the item pointer overlaps with btrfs item. */
if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
btrfs_item_ptr_offset(leaf, slot)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
sizeof(struct btrfs_item),
@@ -509,7 +513,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
* Check if the item size and content meet other
* criteria
*/
- ret = check_leaf_item(root, leaf, &key, slot);
+ ret = check_leaf_item(fs_info, leaf, &key, slot);
if (ret < 0)
return ret;
}
@@ -522,18 +526,19 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
return 0;
}
-int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf)
+int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf)
{
- return check_leaf(root, leaf, true);
+ return check_leaf(fs_info, leaf, true);
}
-int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf)
{
- return check_leaf(root, leaf, false);
+ return check_leaf(fs_info, leaf, false);
}
-int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
+int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
{
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
@@ -541,12 +546,12 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
u64 bytenr;
int ret = 0;
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
- btrfs_crit(root->fs_info,
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
+ btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
- root->objectid, node->start,
+ btrfs_header_owner(node), node->start,
nr == 0 ? "small" : "large", nr,
- BTRFS_NODEPTRS_PER_BLOCK(root->fs_info));
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return -EUCLEAN;
}
@@ -556,21 +561,21 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
btrfs_node_key_to_cpu(node, &next_key, slot + 1);
if (!bytenr) {
- generic_err(root, node, slot,
+ generic_err(fs_info, node, slot,
"invalid NULL node pointer");
ret = -EUCLEAN;
goto out;
}
- if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) {
- generic_err(root, node, slot,
+ if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
+ generic_err(fs_info, node, slot,
"unaligned pointer, have %llu should be aligned to %u",
- bytenr, root->fs_info->sectorsize);
+ bytenr, fs_info->sectorsize);
ret = -EUCLEAN;
goto out;
}
if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
- generic_err(root, node, slot,
+ generic_err(fs_info, node, slot,
"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
key.objectid, key.type, key.offset,
next_key.objectid, next_key.type,
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3d53e8d6fda0..aba542755710 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -25,14 +25,15 @@
* Will check not only the item pointers, but also every possible member
* in item data.
*/
-int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf);
/*
* Less strict leaf checker.
* Will only check item pointers, not reading item data.
*/
-int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf);
-int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
+int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index cb65089127cc..c09dbe4bd6e7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -39,7 +39,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int level;
int next_key_ret = 0;
u64 last_ret = 0;
- u64 min_trans = 0;
if (root->fs_info->extent_root == root) {
/*
@@ -81,7 +80,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &key, path, min_trans);
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -130,7 +129,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
*/
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- min_trans);
+ BTRFS_OLDEST_GENERATION);
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4fd19b4d6675..c91babc6aa4b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -21,12 +21,12 @@
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
+#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
-#include "hash.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
@@ -286,7 +286,7 @@ struct walk_control {
* inside it
*/
int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen);
+ struct walk_control *wc, u64 gen, int level);
};
/*
@@ -294,7 +294,7 @@ struct walk_control {
*/
static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+ struct walk_control *wc, u64 gen, int level)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -304,7 +304,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@@ -853,7 +853,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
char *name;
int name_len;
@@ -887,7 +886,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
else
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -967,7 +966,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
if (key->type == BTRFS_INODE_EXTREF_KEY) {
- if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ if (btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid,
name, namelen, NULL))
match = 1;
@@ -1005,7 +1006,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
u64 ref_index, char *name, int namelen,
int *search_done)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *victim_name;
int victim_name_len;
@@ -1063,7 +1063,7 @@ again:
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
return ret;
*search_done = 1;
@@ -1134,8 +1134,7 @@ again:
victim_name_len);
if (!ret)
ret = btrfs_run_delayed_items(
- trans,
- fs_info);
+ trans);
}
iput(victim_parent);
kfree(victim_name);
@@ -1191,7 +1190,8 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
read_extent_buffer(eb, *name, (unsigned long)&extref->name,
*namelen);
- *index = btrfs_inode_extref_index(eb, extref);
+ if (index)
+ *index = btrfs_inode_extref_index(eb, extref);
if (parent_objectid)
*parent_objectid = btrfs_inode_extref_parent(eb, extref);
@@ -1212,12 +1212,102 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
- *index = btrfs_inode_ref_index(eb, ref);
+ if (index)
+ *index = btrfs_inode_ref_index(eb, ref);
return 0;
}
/*
+ * Take an inode reference item from the log tree and iterate all names from the
+ * inode reference item in the subvolume tree with the same key (if it exists).
+ * For any name that is not in the inode reference item from the log tree, do a
+ * proper unlink of that name (that is, remove its entry from the inode
+ * reference item and both dir index keys).
+ */
+static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode,
+ struct extent_buffer *log_eb,
+ int log_slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ struct extent_buffer *eb;
+
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+ while (ref_ptr < ref_end) {
+ char *name = NULL;
+ int namelen;
+ u64 parent_id;
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ NULL, &parent_id);
+ } else {
+ parent_id = key->offset;
+ ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+ NULL);
+ }
+ if (ret)
+ goto out;
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ parent_id, name,
+ namelen, NULL);
+ else
+ ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
+ namelen, NULL);
+
+ if (!ret) {
+ struct inode *dir;
+
+ btrfs_release_path(path);
+ dir = read_one_inode(root, parent_id);
+ if (!dir) {
+ ret = -ENOENT;
+ kfree(name);
+ goto out;
+ }
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ inode, name, namelen);
+ kfree(name);
+ iput(dir);
+ if (ret)
+ goto out;
+ goto again;
+ }
+
+ kfree(name);
+ ref_ptr += namelen;
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ref_ptr += sizeof(struct btrfs_inode_extref);
+ else
+ ref_ptr += sizeof(struct btrfs_inode_ref);
+ }
+ ret = 0;
+ out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
* root is the destination we are replaying into, and path is for temp
@@ -1345,6 +1435,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * Before we overwrite the inode reference item in the subvolume tree
+ * with the item from the log tree, we must unlink all names from the
+ * parent directory that are in the subvolume's tree inode reference
+ * item, otherwise we end up with an inconsistent subvolume tree where
+ * dir index entries exist for a name but there is no inode reference
+ * item with the same name.
+ */
+ ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
+ key);
+ if (ret)
+ goto out;
+
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
@@ -1992,7 +2095,6 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct extent_buffer *eb;
int slot;
@@ -2056,7 +2158,7 @@ again:
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
kfree(name);
iput(inode);
if (ret)
@@ -2304,17 +2406,16 @@ out:
* back refs).
*/
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+ struct walk_control *wc, u64 gen, int level)
{
int nritems;
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
- int level;
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2431,6 +2532,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level >= BTRFS_MAX_LEVEL);
while (*level > 0) {
+ struct btrfs_key first_key;
+
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
@@ -2443,6 +2546,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
parent = path->nodes[*level];
@@ -2453,7 +2557,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return PTR_ERR(next);
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen);
+ ret = wc->process_func(root, next, wc, ptr_gen,
+ *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2461,7 +2566,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen,
+ *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2491,7 +2597,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2541,7 +2647,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
+ btrfs_header_generation(path->nodes[*level]),
+ *level);
if (ret)
return ret;
@@ -2623,7 +2730,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
ret = wc->process_func(log, path->nodes[orig_level], wc,
- btrfs_header_generation(path->nodes[orig_level]));
+ btrfs_header_generation(path->nodes[orig_level]),
+ orig_level);
if (ret)
goto out;
if (wc->free) {
@@ -3866,6 +3974,7 @@ fill_holes:
ASSERT(ret == 0);
src = src_path->nodes[0];
i = 0;
+ need_find_last_extent = true;
}
btrfs_item_key_to_cpu(src, &key, i);
@@ -3900,6 +4009,36 @@ fill_holes:
break;
*last_extent = extent_end;
}
+
+ /*
+ * Check if there is a hole between the last extent found in our leaf
+ * and the first extent in the next leaf. If there is one, we need to
+ * log an explicit hole so that at replay time we can punch the hole.
+ */
+ if (ret == 0 &&
+ key.objectid == btrfs_ino(inode) &&
+ key.type == BTRFS_EXTENT_DATA_KEY &&
+ i == btrfs_header_nritems(src_path->nodes[0])) {
+ ret = btrfs_next_leaf(inode->root, src_path);
+ need_find_last_extent = true;
+ if (ret > 0) {
+ ret = 0;
+ } else if (ret == 0) {
+ btrfs_item_key_to_cpu(src_path->nodes[0], &key,
+ src_path->slots[0]);
+ if (key.objectid == btrfs_ino(inode) &&
+ key.type == BTRFS_EXTENT_DATA_KEY &&
+ *last_extent < key.offset) {
+ const u64 len = key.offset - *last_extent;
+
+ ret = btrfs_insert_file_extent(trans, log,
+ btrfs_ino(inode),
+ *last_extent, 0,
+ 0, len, 0, len,
+ 0, 0, 0);
+ }
+ }
+ }
/*
* Need to let the callers know we dropped the path so they should
* re-search.
@@ -5411,7 +5550,6 @@ out:
* the last committed transaction
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct dentry *parent,
const loff_t start,
@@ -5419,6 +5557,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
int inode_only,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
struct dentry *old_parent = NULL;
@@ -5444,7 +5583,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) {
+ if (btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
@@ -5576,7 +5715,7 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry,
+ struct dentry *dentry,
const loff_t start,
const loff_t end,
struct btrfs_log_ctx *ctx)
@@ -5584,8 +5723,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
- parent, start, end, LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
+ start, end, LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -5847,13 +5986,12 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct dentry *parent)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
/*
* this will force the logging code to walk the dentry chain
* up for the file
*/
- if (S_ISREG(inode->vfs_inode.i_mode))
+ if (!S_ISDIR(inode->vfs_inode.i_mode))
inode->last_unlink_trans = trans->transid;
/*
@@ -5864,7 +6002,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 0,
- LLONG_MAX, LOG_INODE_EXISTS, NULL);
+ return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 483027f9a7f4..88abc43312a1 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -65,7 +65,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry,
+ struct dentry *dentry,
const loff_t start,
const loff_t end,
struct btrfs_log_ctx *ctx);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 726f928238d0..9916f03430bc 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -282,7 +282,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.offset = 0;
again_search_slot:
- ret = btrfs_search_forward(root, &key, path, 0);
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2ceb924ca0d6..93f8f17cacca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
+#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -278,7 +279,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev,
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_cleanup_fs_uuids(void)
+void __exit btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -708,7 +709,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
fs_devices->rw_devices++;
- list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+ list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
brelse(bh);
@@ -895,7 +896,11 @@ error:
return ERR_PTR(-ENOMEM);
}
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
+/*
+ * After we have read the system tree and know devids belonging to
+ * this filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
@@ -1103,6 +1108,20 @@ out:
return ret;
}
+static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_device *dev1, *dev2;
+
+ dev1 = list_entry(a, struct btrfs_device, dev_list);
+ dev2 = list_entry(b, struct btrfs_device, dev_list);
+
+ if (dev1->devid < dev2->devid)
+ return -1;
+ else if (dev1->devid > dev2->devid)
+ return 1;
+ return 0;
+}
+
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
@@ -1113,6 +1132,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->opened++;
ret = 0;
} else {
+ list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = __btrfs_open_devices(fs_devices, flags, holder);
}
mutex_unlock(&uuid_mutex);
@@ -1916,12 +1936,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_lock(&uuid_mutex);
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
@@ -2047,7 +2067,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices;
- WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
/*
* in case of fs with no seed, srcdev->fs_devices will point
@@ -2237,7 +2257,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 super_flags;
- BUG_ON(!mutex_is_locked(&uuid_mutex));
+ lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
return -EINVAL;
@@ -2642,7 +2662,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
- ASSERT(list_empty(&srcdev->resized_list));
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
@@ -2666,19 +2685,6 @@ error:
return ret;
}
-void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
- struct btrfs_device *tgtdev)
-{
- u32 sectorsize = fs_info->sectorsize;
-
- WARN_ON(fs_info->fs_devices->rw_devices == 0);
- tgtdev->io_width = sectorsize;
- tgtdev->io_align = sectorsize;
- tgtdev->sector_size = sectorsize;
- tgtdev->fs_info = fs_info;
- set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state);
-}
-
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
@@ -2984,7 +2990,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
+ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_can_relocate(fs_info, chunk_offset);
if (ret)
@@ -2997,6 +3003,16 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (ret)
return ret;
+ /*
+ * We add the kobjects here (and after forcing data chunk creation)
+ * since relocation is the only place we'll create chunks of a new
+ * type at runtime. The only place where we'll remove the last
+ * chunk of a type is the call immediately below this one. Even
+ * so, we're protected against races with the cleaner thread since
+ * we're covered by the delete_unused_bgs_mutex.
+ */
+ btrfs_add_raid_kobjects(fs_info);
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3124,6 +3140,8 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
if (ret < 0)
return ret;
+ btrfs_add_raid_kobjects(fs_info);
+
return 1;
}
}
@@ -3892,12 +3910,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
BUG_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -4202,7 +4220,8 @@ static int btrfs_uuid_scan_kthread(void *data)
key.offset = 0;
while (1) {
- ret = btrfs_search_forward(root, &key, path, 0);
+ ret = btrfs_search_forward(root, &key, path,
+ BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
@@ -4672,7 +4691,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -4713,10 +4732,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
BUG_ON(!alloc_profile_is_valid(type, 0));
- if (list_empty(&fs_devices->alloc_list))
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
return -ENOSPC;
+ }
- index = __get_raid_index(type);
+ index = btrfs_bg_flags_to_raid_index(type);
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
@@ -4729,7 +4751,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -4738,7 +4760,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
@@ -4797,8 +4819,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret == 0)
max_avail = max_stripe_size * dev_stripes;
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+ if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info,
+ "%s: devid %llu has no free space, have=%llu want=%u",
+ __func__, device->devid, max_avail,
+ BTRFS_STRIPE_LEN * dev_stripes);
continue;
+ }
if (ndevs == fs_devices->rw_devices) {
WARN(1, "%s: found more than %llu devices\n",
@@ -4821,18 +4849,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
/* round down to number of usable stripes */
ndevs = round_down(ndevs, devs_increment);
- if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+ if (ndevs < devs_min) {
ret = -ENOSPC;
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ndevs, devs_min);
+ }
goto error;
}
ndevs = min(ndevs, devs_max);
/*
- * the primary goal is to maximize the number of stripes, so use as many
- * devices as possible, even if the stripes are not maximum sized.
+ * The primary goal is to maximize the number of stripes, so use as
+ * many devices as possible, even if the stripes are not maximum sized.
+ *
+ * The DUP profile stores more than one stripe per device, the
+ * max_avail is the total size so we have to adjust.
*/
- stripe_size = devices_info[ndevs-1].max_avail;
+ stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
num_stripes = ndevs * dev_stripes;
/*
@@ -4853,22 +4889,19 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* and compare that answer with the max chunk size
*/
if (stripe_size * data_stripes > max_chunk_size) {
- u64 mask = (1ULL << 24) - 1;
-
stripe_size = div_u64(max_chunk_size, data_stripes);
/* bump the answer up to a 16MB boundary */
- stripe_size = (stripe_size + mask) & ~mask;
+ stripe_size = round_up(stripe_size, SZ_16M);
- /* but don't go higher than the limits we found
- * while searching for free extents
+ /*
+ * But don't go higher than the limits we found while searching
+ * for free extents
*/
- if (stripe_size > devices_info[ndevs-1].max_avail)
- stripe_size = devices_info[ndevs-1].max_avail;
+ stripe_size = min(devices_info[ndevs - 1].max_avail,
+ stripe_size);
}
- stripe_size = div_u64(stripe_size, dev_stripes);
-
/* align to BTRFS_STRIPE_LEN */
stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
@@ -5067,7 +5100,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
chunk_offset = find_next_chunk(fs_info);
return __btrfs_alloc_chunk(trans, chunk_offset, type);
}
@@ -5208,11 +5241,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = 1;
free_extent_map(em);
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev)
ret++;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
return ret;
}
@@ -5253,13 +5286,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first, int num,
- int optimal, int dev_replace_is_ongoing)
+ struct map_lookup *map, int first,
+ int dev_replace_is_ongoing)
{
int i;
+ int num_stripes;
+ int preferred_mirror;
int tolerance;
struct btrfs_device *srcdev;
+ ASSERT((map->type &
+ (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = map->sub_stripes;
+ else
+ num_stripes = map->num_stripes;
+
+ preferred_mirror = first + current->pid % num_stripes;
+
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
@@ -5273,10 +5318,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
* mirror is available
*/
for (tolerance = 0; tolerance < 2; tolerance++) {
- if (map->stripes[optimal].dev->bdev &&
- (tolerance || map->stripes[optimal].dev != srcdev))
- return optimal;
- for (i = first; i < first + num; i++) {
+ if (map->stripes[preferred_mirror].dev->bdev &&
+ (tolerance || map->stripes[preferred_mirror].dev != srcdev))
+ return preferred_mirror;
+ for (i = first; i < first + num_stripes; i++) {
if (map->stripes[i].dev->bdev &&
(tolerance || map->stripes[i].dev != srcdev))
return i;
@@ -5286,7 +5331,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
- return optimal;
+ return preferred_mirror;
}
static inline int parity_smaller(u64 a, u64 b)
@@ -5778,10 +5823,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (!bbio_ret)
goto out;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (!dev_replace_is_ongoing)
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
else
btrfs_dev_replace_set_lock_blocking(dev_replace);
@@ -5813,8 +5858,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
- map->num_stripes,
- current->pid % map->num_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
@@ -5842,8 +5885,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
stripe_index,
- map->sub_stripes, stripe_index +
- current->pid % map->sub_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
@@ -5983,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
out:
if (dev_replace_is_ongoing) {
btrfs_dev_replace_clear_lock_blocking(dev_replace);
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
return ret;
@@ -6617,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices;
int ret;
- BUG_ON(!mutex_is_locked(&uuid_mutex));
+ lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
fs_devices = fs_info->fs_devices->seed;
@@ -7357,20 +7398,20 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
}
/* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *transaction)
+void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
int i;
- if (list_empty(&transaction->pending_chunks))
+ if (list_empty(&trans->pending_chunks))
return;
/* In order to kick the device replace finish process */
mutex_lock(&fs_info->chunk_mutex);
- list_for_each_entry(em, &transaction->pending_chunks, list) {
+ list_for_each_entry(em, &trans->pending_chunks, list) {
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 28c28eeadff3..d1fcaea9fef5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -422,7 +422,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
struct btrfs_device *device, struct btrfs_device *this_dev);
int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
@@ -436,7 +436,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u8 *uuid);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
const char *device_path, u64 devid);
-void btrfs_cleanup_fs_uuids(void);
+void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
@@ -476,8 +476,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
-void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
- struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
@@ -546,9 +544,30 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
btrfs_dev_stat_set(dev, index, 0);
}
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
-void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *transaction);
+void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index de7d072c78ef..e1e8177deb5e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -33,7 +33,7 @@
#include "locking.h"
-ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size)
{
struct btrfs_dir_item *di;
@@ -233,7 +233,7 @@ out:
/*
* @value: "" makes the attribute to empty, NULL removes it
*/
-int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+int btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
@@ -374,7 +374,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
const char *name, void *buffer, size_t size)
{
name = xattr_full_name(handler, name);
- return __btrfs_getxattr(inode, name, buffer, size);
+ return btrfs_getxattr(inode, name, buffer, size);
}
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
@@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
name = xattr_full_name(handler, name);
- return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+ return btrfs_setxattr(NULL, inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
@@ -448,8 +448,8 @@ static int btrfs_initxattrs(struct inode *inode,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = __btrfs_setxattr(trans, inode, name,
- xattr->value, xattr->value_len, 0);
+ err = btrfs_setxattr(trans, inode, name, xattr->value,
+ xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 15fc4743dc70..e215a3212a2a 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -23,13 +23,14 @@
extern const struct xattr_handler *btrfs_xattr_handlers[];
-extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
-extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+int btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr);
diff --git a/fs/buffer.c b/fs/buffer.c
index 9a73924db22f..ec5dd39071e6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1511,7 +1511,7 @@ void block_invalidatepage(struct page *page, unsigned int offset,
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (offset == 0)
+ if (length == PAGE_SIZE)
try_to_release_page(page, 0);
out:
return;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index e7f16a77a22a..222bc5d8b62c 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -32,7 +32,7 @@ static struct fscache_object *cachefiles_alloc_object(
struct cachefiles_cache *cache;
struct cachefiles_xattr *auxdata;
unsigned keylen, auxlen;
- void *buffer;
+ void *buffer, *p;
char *key;
cache = container_of(_cache, struct cachefiles_cache, cache);
@@ -65,8 +65,12 @@ static struct fscache_object *cachefiles_alloc_object(
if (!buffer)
goto nomem_buffer;
- keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
- ASSERTCMP(keylen, <, 512);
+ keylen = cookie->key_len;
+ if (keylen <= sizeof(cookie->inline_key))
+ p = cookie->inline_key;
+ else
+ p = cookie->key;
+ memcpy(buffer + 2, p, keylen);
*(uint16_t *)buffer = keylen;
((char *)buffer)[keylen + 2] = 0;
@@ -80,15 +84,17 @@ static struct fscache_object *cachefiles_alloc_object(
/* get hold of the auxiliary data and prepend the object type */
auxdata = buffer;
- auxlen = 0;
- if (cookie->def->get_aux) {
- auxlen = cookie->def->get_aux(cookie->netfs_data,
- auxdata->data, 511);
- ASSERTCMP(auxlen, <, 511);
+ auxlen = cookie->aux_len;
+ if (auxlen) {
+ if (auxlen <= sizeof(cookie->inline_aux))
+ p = cookie->inline_aux;
+ else
+ p = cookie->aux;
+ memcpy(auxdata->data, p, auxlen);
}
auxdata->len = auxlen + 1;
- auxdata->type = cookie->def->type;
+ auxdata->type = cookie->type;
lookup_data->auxdata = auxdata;
lookup_data->key = key;
@@ -177,10 +183,12 @@ static void cachefiles_lookup_complete(struct fscache_object *_object)
* increment the usage count on an inode object (may fail if unmounting)
*/
static
-struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
+struct fscache_object *cachefiles_grab_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why)
{
struct cachefiles_object *object =
container_of(_object, struct cachefiles_object, fscache);
+ int u;
_enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
@@ -188,7 +196,9 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
#endif
- atomic_inc(&object->usage);
+ u = atomic_inc_return(&object->usage);
+ trace_cachefiles_ref(object, _object->cookie,
+ (enum cachefiles_obj_ref_trace)why, u);
return &object->fscache;
}
@@ -202,6 +212,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
struct cachefiles_cache *cache;
struct fscache_cookie *cookie;
const struct cred *saved_cred;
+ const void *aux;
unsigned auxlen;
_enter("{OBJ%x}", _object->debug_id);
@@ -216,26 +227,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
}
cookie = object->fscache.cookie;
+ auxlen = cookie->aux_len;
- if (!cookie->def->get_aux) {
+ if (!auxlen) {
fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
- auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
+ auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp);
if (!auxdata) {
fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
- auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ aux = (auxlen <= sizeof(cookie->inline_aux)) ?
+ cookie->inline_aux : cookie->aux;
+
+ memcpy(auxdata->data, aux, auxlen);
fscache_unuse_cookie(_object);
- ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
- auxdata->type = cookie->def->type;
+ auxdata->type = cookie->type;
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_update_object_xattr(object, auxdata);
@@ -309,10 +323,12 @@ static void cachefiles_drop_object(struct fscache_object *_object)
/*
* dispose of a reference to an object
*/
-static void cachefiles_put_object(struct fscache_object *_object)
+static void cachefiles_put_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why)
{
struct cachefiles_object *object;
struct fscache_cache *cache;
+ int u;
ASSERT(_object);
@@ -328,7 +344,11 @@ static void cachefiles_put_object(struct fscache_object *_object)
ASSERTIFCMP(object->fscache.parent,
object->fscache.parent->n_children, >, 0);
- if (atomic_dec_and_test(&object->usage)) {
+ u = atomic_dec_return(&object->usage);
+ trace_cachefiles_ref(object, _object->cookie,
+ (enum cachefiles_obj_ref_trace)why, u);
+ ASSERTCMP(u, !=, -1);
+ if (u == 0) {
_debug("- kill object OBJ%x", object->fscache.debug_id);
ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
@@ -421,7 +441,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
loff_t oi_size;
int ret;
- _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+ ni_size = _object->store_limit_l;
_enter("{OBJ%x},[%llu]",
_object->debug_id, (unsigned long long) ni_size);
@@ -493,8 +513,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op)
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
- op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
- &ni_size);
+ ni_size = op->object->store_limit_l;
_enter("{OBJ%x},[%llu]",
op->object->debug_id, (unsigned long long)ni_size);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bb3a02ca9da4..d2f6f996e65a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -124,6 +124,8 @@ struct cachefiles_xattr {
uint8_t data[];
};
+#include <trace/events/cachefiles.h>
+
/*
* note change of state for daemon
*/
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 711f13d8c2de..f54d3f5b2e40 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -22,6 +22,7 @@
#include <linux/statfs.h>
#include <linux/sysctl.h>
#include <linux/miscdevice.h>
+#define CREATE_TRACE_POINTS
#include "internal.h"
unsigned cachefiles_debug;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 3978b324cbca..0daa1e3fe0df 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -30,11 +30,11 @@
*/
static noinline
void __cachefiles_printk_object(struct cachefiles_object *object,
- const char *prefix,
- u8 *keybuf)
+ const char *prefix)
{
struct fscache_cookie *cookie;
- unsigned keylen, loop;
+ const u8 *k;
+ unsigned loop;
pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
@@ -56,23 +56,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
object->fscache.cookie->parent,
object->fscache.cookie->netfs_data,
object->fscache.cookie->flags);
- if (keybuf && cookie->def)
- keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
- CACHEFILES_KEYBUF_SIZE);
- else
- keylen = 0;
+ pr_err("%skey=[%u] '", prefix, cookie->key_len);
+ k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
+ cookie->inline_key : cookie->key;
+ for (loop = 0; loop < cookie->key_len; loop++)
+ pr_cont("%02x", k[loop]);
+ pr_cont("'\n");
} else {
pr_err("%scookie=NULL\n", prefix);
- keylen = 0;
}
spin_unlock(&object->fscache.lock);
-
- if (keylen) {
- pr_err("%skey=[%u] '", prefix, keylen);
- for (loop = 0; loop < keylen; loop++)
- pr_cont("%02x", keybuf[loop]);
- pr_cont("'\n");
- }
}
/*
@@ -81,14 +74,10 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
static noinline void cachefiles_printk_object(struct cachefiles_object *object,
struct cachefiles_object *xobject)
{
- u8 *keybuf;
-
- keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
if (object)
- __cachefiles_printk_object(object, "", keybuf);
+ __cachefiles_printk_object(object, "");
if (xobject)
- __cachefiles_printk_object(xobject, "x", keybuf);
- kfree(keybuf);
+ __cachefiles_printk_object(xobject, "x");
}
/*
@@ -120,6 +109,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
}
write_unlock(&cache->active_lock);
+ trace_cachefiles_mark_buried(NULL, dentry, why);
_leave(" [no owner]");
return;
@@ -130,6 +120,8 @@ found_dentry:
object->fscache.state->name,
dentry);
+ trace_cachefiles_mark_buried(object, dentry, why);
+
if (fscache_object_is_live(&object->fscache)) {
pr_err("\n");
pr_err("Error: Can't preemptively bury live object\n");
@@ -158,13 +150,15 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
try_again:
write_lock(&cache->active_lock);
+ dentry = object->dentry;
+ trace_cachefiles_mark_active(object, dentry);
+
if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
pr_err("Error: Object already active\n");
cachefiles_printk_object(object, NULL);
BUG();
}
- dentry = object->dentry;
_p = &cache->active_nodes.rb_node;
while (*_p) {
_parent = *_p;
@@ -191,6 +185,8 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
+ trace_cachefiles_wait_active(object, dentry, xobject);
+
if (fscache_object_is_live(&xobject->fscache)) {
pr_err("\n");
pr_err("Error: Unexpected object collision\n");
@@ -248,12 +244,12 @@ wait_for_old_object:
ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
- cache->cache.ops->put_object(&xobject->fscache);
+ cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry);
goto try_again;
requeue:
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
- cache->cache.ops->put_object(&xobject->fscache);
+ cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo);
_leave(" = -ETIMEDOUT");
return -ETIMEDOUT;
}
@@ -265,6 +261,11 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
struct cachefiles_object *object,
blkcnt_t i_blocks)
{
+ struct dentry *dentry = object->dentry;
+ struct inode *inode = d_backing_inode(dentry);
+
+ trace_cachefiles_mark_inactive(object, dentry, inode);
+
write_lock(&cache->active_lock);
rb_erase(&object->active_node, &cache->active_nodes);
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
@@ -288,6 +289,7 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
* - unlocks the directory mutex
*/
static int cachefiles_bury_object(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
struct dentry *dir,
struct dentry *rep,
bool preemptive,
@@ -312,6 +314,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
if (ret < 0) {
cachefiles_io_error(cache, "Unlink security error");
} else {
+ trace_cachefiles_unlink(object, rep, why);
ret = vfs_unlink(d_inode(dir), rep, NULL);
if (preemptive)
@@ -413,6 +416,7 @@ try_again:
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
+ trace_cachefiles_rename(object, rep, grave, why);
ret = vfs_rename(d_inode(dir), rep,
d_inode(cache->graveyard), grave, NULL, 0);
if (ret != 0 && ret != -ENOMEM)
@@ -458,7 +462,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
/* we need to check that our parent is _still_ our parent - it
* may have been renamed */
if (dir == object->dentry->d_parent) {
- ret = cachefiles_bury_object(cache, dir,
+ ret = cachefiles_bury_object(cache, object, dir,
object->dentry, false,
FSCACHE_OBJECT_WAS_RETIRED);
} else {
@@ -486,6 +490,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
{
struct cachefiles_cache *cache;
struct dentry *dir, *next = NULL;
+ struct inode *inode;
struct path path;
unsigned long start;
const char *name;
@@ -529,13 +534,17 @@ lookup_again:
start = jiffies;
next = lookup_one_len(name, dir, nlen);
cachefiles_hist(cachefiles_lookup_histogram, start);
- if (IS_ERR(next))
+ if (IS_ERR(next)) {
+ trace_cachefiles_lookup(object, next, NULL);
goto lookup_error;
+ }
- _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative");
+ inode = d_backing_inode(next);
+ trace_cachefiles_lookup(object, next, inode);
+ _debug("next -> %p %s", next, inode ? "positive" : "negative");
if (!key)
- object->new = !d_backing_inode(next);
+ object->new = !inode;
/* if this element of the path doesn't exist, then the lookup phase
* failed, and we can release any readers in the certain knowledge that
@@ -558,6 +567,8 @@ lookup_again:
start = jiffies;
ret = vfs_mkdir(d_inode(dir), next, 0);
cachefiles_hist(cachefiles_mkdir_histogram, start);
+ if (!key)
+ trace_cachefiles_mkdir(object, next, ret);
if (ret < 0)
goto create_error;
@@ -587,6 +598,7 @@ lookup_again:
start = jiffies;
ret = vfs_create(d_inode(dir), next, S_IFREG, true);
cachefiles_hist(cachefiles_create_histogram, start);
+ trace_cachefiles_create(object, next, ret);
if (ret < 0)
goto create_error;
@@ -629,7 +641,8 @@ lookup_again:
* mutex) */
object->dentry = NULL;
- ret = cachefiles_bury_object(cache, dir, next, true,
+ ret = cachefiles_bury_object(cache, object, dir, next,
+ true,
FSCACHE_OBJECT_IS_STALE);
dput(next);
next = NULL;
@@ -955,7 +968,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
/* actually remove the victim (drops the dir mutex) */
_debug("bury");
- ret = cachefiles_bury_object(cache, dir, victim, false,
+ ret = cachefiles_bury_object(cache, NULL, dir, victim, false,
FSCACHE_OBJECT_WAS_CULLED);
if (ret < 0)
goto error;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 883bc7bb12c5..5082c8a49686 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -952,6 +952,7 @@ error:
* - cache withdrawal is prevented by the caller
*/
void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
+ __releases(&object->fscache.cookie->lock)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index d31c1a72d8a5..0a29a00aed2e 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -113,6 +113,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
/* attempt to install the cache metadata directly */
_debug("SET #%u", auxdata->len);
+ clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
XATTR_CREATE);
@@ -141,6 +142,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
/* attempt to install the cache metadata directly */
_debug("SET #%u", auxdata->len);
+ clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
XATTR_REPLACE);
@@ -180,7 +182,8 @@ int cachefiles_check_auxdata(struct cachefiles_object *object)
goto error;
xlen--;
- validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
+ validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen,
+ i_size_read(d_backing_inode(dentry)));
if (validity != FSCACHE_CHECKAUX_OKAY)
goto error;
@@ -249,7 +252,8 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
object->fscache.cookie->def->name, dlen);
result = fscache_check_aux(&object->fscache,
- &auxbuf->data, dlen);
+ &auxbuf->data, dlen,
+ i_size_read(d_backing_inode(dentry)));
switch (result) {
/* entry okay as is */
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a3ab265d3215..33a211b364ed 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -27,7 +27,6 @@
struct ceph_aux_inode {
u64 version;
struct timespec mtime;
- loff_t size;
};
struct fscache_netfs ceph_cache_netfs = {
@@ -41,34 +40,15 @@ static LIST_HEAD(ceph_fscache_list);
struct ceph_fscache_entry {
struct list_head list;
struct fscache_cookie *fscache;
- struct ceph_fsid fsid;
size_t uniq_len;
+ /* The following members must be last */
+ struct ceph_fsid fsid;
char uniquifier[0];
};
-static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct ceph_fs_client* fsc = cookie_netfs_data;
- const char *fscache_uniq = fsc->mount_options->fscache_uniq;
- uint16_t fsid_len, uniq_len;
-
- fsid_len = sizeof(fsc->client->fsid);
- uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
- if (fsid_len + uniq_len > maxbuf)
- return 0;
-
- memcpy(buffer, &fsc->client->fsid, fsid_len);
- if (uniq_len)
- memcpy(buffer + fsid_len, fscache_uniq, uniq_len);
-
- return fsid_len + uniq_len;
-}
-
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
.name = "CEPH.fsid",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = ceph_fscache_session_get_key,
};
int ceph_fscache_register(void)
@@ -110,16 +90,19 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
goto out_unlock;
}
+ memcpy(&ent->fsid, fsid, sizeof(*fsid));
+ if (uniq_len > 0) {
+ memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
+ ent->uniq_len = uniq_len;
+ }
+
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
- fsc, true);
+ &ent->fsid, sizeof(ent->fsid) + uniq_len,
+ NULL, 0,
+ fsc, 0, true);
if (fsc->fscache) {
- memcpy(&ent->fsid, fsid, sizeof(*fsid));
- if (uniq_len > 0) {
- memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
- ent->uniq_len = uniq_len;
- }
ent->fscache = fsc->fscache;
list_add_tail(&ent->list, &ceph_fscache_list);
} else {
@@ -133,59 +116,21 @@ out_unlock:
return err;
}
-static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct ceph_inode_info* ci = cookie_netfs_data;
- uint16_t klen;
-
- /* use ceph virtual inode (id + snapshot) */
- klen = sizeof(ci->i_vino);
- if (klen > maxbuf)
- return 0;
-
- memcpy(buffer, &ci->i_vino, klen);
- return klen;
-}
-
-static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- struct ceph_aux_inode aux;
- const struct ceph_inode_info* ci = cookie_netfs_data;
- const struct inode* inode = &ci->vfs_inode;
-
- memset(&aux, 0, sizeof(aux));
- aux.version = ci->i_version;
- aux.mtime = inode->i_mtime;
- aux.size = i_size_read(inode);
-
- memcpy(buffer, &aux, sizeof(aux));
-
- return sizeof(aux);
-}
-
-static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct ceph_inode_info* ci = cookie_netfs_data;
- *size = i_size_read(&ci->vfs_inode);
-}
-
static enum fscache_checkaux ceph_fscache_inode_check_aux(
- void *cookie_netfs_data, const void *data, uint16_t dlen)
+ void *cookie_netfs_data, const void *data, uint16_t dlen,
+ loff_t object_size)
{
struct ceph_aux_inode aux;
struct ceph_inode_info* ci = cookie_netfs_data;
struct inode* inode = &ci->vfs_inode;
- if (dlen != sizeof(aux))
+ if (dlen != sizeof(aux) ||
+ i_size_read(inode) != object_size)
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&aux, 0, sizeof(aux));
aux.version = ci->i_version;
aux.mtime = inode->i_mtime;
- aux.size = i_size_read(inode);
if (memcmp(data, &aux, sizeof(aux)) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
@@ -197,9 +142,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.name = "CEPH.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = ceph_fscache_inode_get_key,
- .get_attr = ceph_fscache_inode_get_attr,
- .get_aux = ceph_fscache_inode_get_aux,
.check_aux = ceph_fscache_inode_check_aux,
};
@@ -207,6 +149,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_aux_inode aux;
/* No caching for filesystem */
if (!fsc->fscache)
@@ -218,9 +161,14 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
inode_lock_nested(inode, I_MUTEX_CHILD);
if (!ci->fscache) {
+ memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
+ aux.mtime = inode->i_mtime;
ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- ci, false);
+ &ceph_fscache_inode_object_def,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &aux, sizeof(aux),
+ ci, i_size_read(inode), false);
}
inode_unlock(inode);
}
@@ -235,7 +183,7 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
ci->fscache = NULL;
fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
- fscache_relinquish_cookie(cookie, 0);
+ fscache_relinquish_cookie(cookie, &ci->i_vino, false);
}
static bool ceph_fscache_can_enable(void *data)
@@ -254,11 +202,11 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
if (inode_is_open_for_write(inode)) {
dout("fscache_file_set_cookie %p %p disabling cache\n",
inode, filp);
- fscache_disable_cookie(ci->fscache, false);
+ fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
fscache_uncache_all_inode_pages(ci->fscache, inode);
} else {
- fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
- inode);
+ fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
+ ceph_fscache_can_enable, inode);
if (fscache_cookie_enabled(ci->fscache)) {
dout("fscache_file_set_cookie %p %p enabling cache\n",
inode, filp);
@@ -351,7 +299,8 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
if (!cache_valid(ci))
return;
- ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
+ ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
+ GFP_KERNEL);
if (ret)
fscache_uncache_page(ci->fscache, page);
}
@@ -385,7 +334,7 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
WARN_ON_ONCE(!found);
mutex_unlock(&ceph_fscache_lock);
- __fscache_relinquish_cookie(fsc->fscache, 0);
+ __fscache_relinquish_cookie(fsc->fscache, NULL, false);
}
fsc->fscache = NULL;
}
@@ -402,7 +351,7 @@ void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
* truncate while the caller holds CEPH_CAP_FILE_RD */
mutex_lock(&ci->i_truncate_mutex);
if (!cache_valid(ci)) {
- if (fscache_check_consistency(ci->fscache))
+ if (fscache_check_consistency(ci->fscache, &ci->i_vino))
fscache_invalidate(ci->fscache);
spin_lock(&ci->i_ceph_lock);
ci->i_fscache_gen = ci->i_rdcache_gen;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6582c4507e6c..0e5bd3e3344e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3965,6 +3965,32 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
}
/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+int ceph_drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ if (__ceph_caps_dirty(ci)) {
+ struct ceph_mds_client *mdsc =
+ ceph_inode_to_client(inode)->mdsc;
+ __cap_delay_requeue_front(mdsc, ci);
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return drop;
+}
+
+/*
* Helpers for embedding cap and dentry lease releases into mds
* requests.
*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0c4346806e17..2bdd561c4c68 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2,7 +2,6 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/spinlock.h>
-#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
@@ -1003,26 +1002,6 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
/*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
- * looks like the link count will hit 0, drop any other caps (other
- * than PIN) we don't specifically want (due to the file still being
- * open).
- */
-static int drop_caps_for_unlink(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
-
- spin_lock(&ci->i_ceph_lock);
- if (inode->i_nlink == 1) {
- drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
- }
- spin_unlock(&ci->i_ceph_lock);
- return drop;
-}
-
-/*
* rmdir and unlink are differ only by the metadata op code
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
@@ -1056,7 +1035,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- req->r_inode_drop = drop_caps_for_unlink(inode);
+ req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
d_delete(dentry);
@@ -1104,8 +1083,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
/* release LINK_RDCACHE on source inode (mds will lock it) */
req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
- if (d_really_is_positive(new_dentry))
- req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
+ if (d_really_is_positive(new_dentry)) {
+ req->r_inode_drop =
+ ceph_drop_caps_for_unlink(d_inode(new_dentry));
+ }
err = ceph_mdsc_do_request(mdsc, old_dir, req);
if (!err && !req->r_reply_info.head->is_dentry) {
/*
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6639926eed4e..b67eec3532a1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -640,7 +640,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
struct ceph_aio_request {
struct kiocb *iocb;
size_t total_len;
- int write;
+ bool write;
+ bool should_dirty;
int error;
struct list_head osd_reqs;
unsigned num_reqs;
@@ -750,7 +751,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
+ ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -847,6 +848,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
+ bool should_dirty = !write && iter_is_iovec(iter);
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -914,6 +916,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (aio_req) {
aio_req->iocb = iocb;
aio_req->write = write;
+ aio_req->should_dirty = should_dirty;
INIT_LIST_HEAD(&aio_req->osd_reqs);
if (write) {
aio_req->mtime = mtime;
@@ -971,7 +974,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
len = ret;
}
- ceph_put_page_vector(pages, num_pages, !write);
+ ceph_put_page_vector(pages, num_pages, should_dirty);
ceph_osdc_put_request(req);
if (ret < 0)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a62d2a9841dc..fb2bc9c15a23 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -225,6 +225,7 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
break;
case Opt_mds_namespace:
+ kfree(fsopt->mds_namespace);
fsopt->mds_namespace = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
GFP_KERNEL);
@@ -232,6 +233,7 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
break;
case Opt_fscache_uniq:
+ kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
GFP_KERNEL);
@@ -711,14 +713,17 @@ static int __init init_caches(void)
goto bad_dentry;
ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
-
if (!ceph_file_cachep)
goto bad_file;
- if ((error = ceph_fscache_register()))
- goto bad_file;
+ error = ceph_fscache_register();
+ if (error)
+ goto bad_fscache;
return 0;
+
+bad_fscache:
+ kmem_cache_destroy(ceph_file_cachep);
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
@@ -836,7 +841,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
int err;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
- int first = 0; /* first vfsmount for this super_block */
dout("mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
@@ -861,17 +865,17 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
path = fsc->mount_options->server_path + 1;
dout("mount opening path %s\n", path);
}
+
+ err = ceph_fs_debugfs_init(fsc);
+ if (err < 0)
+ goto out;
+
root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
}
fsc->sb->s_root = dget(root);
- first = 1;
-
- err = ceph_fs_debugfs_init(fsc);
- if (err < 0)
- goto fail;
} else {
root = dget(fsc->sb->s_root);
}
@@ -881,11 +885,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
mutex_unlock(&fsc->client->mount_mutex);
return root;
-fail:
- if (first) {
- dput(fsc->sb->s_root);
- fsc->sb->s_root = NULL;
- }
out:
mutex_unlock(&fsc->client->mount_mutex);
return ERR_PTR(err);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 21b2e5b004eb..1c2086e0fec2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -987,7 +987,7 @@ extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
-
+extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force);
extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a65e4a56318c..a279c58fe360 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -67,18 +67,18 @@ static int find_dynamic_major(void)
int i;
struct char_device_struct *cd;
- for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) {
+ for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) {
if (chrdevs[i] == NULL)
return i;
}
for (i = CHRDEV_MAJOR_DYN_EXT_START;
- i > CHRDEV_MAJOR_DYN_EXT_END; i--) {
+ i >= CHRDEV_MAJOR_DYN_EXT_END; i--) {
for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next)
if (cd->major == i)
break;
- if (cd == NULL || cd->major != i)
+ if (cd == NULL)
return i;
}
@@ -121,8 +121,8 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
}
if (major >= CHRDEV_MAJOR_MAX) {
- pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n",
- name, major, CHRDEV_MAJOR_MAX);
+ pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n",
+ name, major, CHRDEV_MAJOR_MAX-1);
ret = -EINVAL;
goto out;
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 687da62daf4e..741749a98614 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -187,13 +187,13 @@ config CIFS_NFSD_EXPORT
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB311
- bool "SMB3.1.1 network file system support (Experimental)"
+ bool "SMB3.1.1 network file system support"
depends on CIFS
+ select CRYPTO_SHA512
help
- This enables experimental support for the newest, SMB3.1.1, dialect.
- This dialect includes improved security negotiation features.
- If unsure, say N
+ This enables support for the newest and most secure dialect, SMB3.1.1.
+ If unsure, say Y
config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 2c14020e5e1d..edf5f40898bf 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -46,67 +46,11 @@ void cifs_fscache_unregister(void)
}
/*
- * Key layout of CIFS server cache index object
- */
-struct cifs_server_key {
- uint16_t family; /* address family */
- __be16 port; /* IP port */
- union {
- struct in_addr ipv4_addr;
- struct in6_addr ipv6_addr;
- } addr[0];
-};
-
-/*
- * Server object keyed by {IPaddress,port,family} tuple
- */
-static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct TCP_Server_Info *server = cookie_netfs_data;
- const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
- const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
- const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
- struct cifs_server_key *key = buffer;
- uint16_t key_len = sizeof(struct cifs_server_key);
-
- memset(key, 0, key_len);
-
- /*
- * Should not be a problem as sin_family/sin6_family overlays
- * sa_family field
- */
- switch (sa->sa_family) {
- case AF_INET:
- key->family = sa->sa_family;
- key->port = addr->sin_port;
- key->addr[0].ipv4_addr = addr->sin_addr;
- key_len += sizeof(key->addr[0].ipv4_addr);
- break;
-
- case AF_INET6:
- key->family = sa->sa_family;
- key->port = addr6->sin6_port;
- key->addr[0].ipv6_addr = addr6->sin6_addr;
- key_len += sizeof(key->addr[0].ipv6_addr);
- break;
-
- default:
- cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
- key_len = 0;
- break;
- }
-
- return key_len;
-}
-
-/*
* Server object for FS-Cache
*/
const struct fscache_cookie_def cifs_fscache_server_index_def = {
.name = "CIFS.server",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = cifs_server_get_key,
};
/*
@@ -116,7 +60,7 @@ struct cifs_fscache_super_auxdata {
u64 resource_id; /* unique server resource id */
};
-static char *extract_sharename(const char *treename)
+char *extract_sharename(const char *treename)
{
const char *src;
char *delim, *dst;
@@ -140,56 +84,11 @@ static char *extract_sharename(const char *treename)
return dst;
}
-/*
- * Superblock object currently keyed by share name
- */
-static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
- uint16_t maxbuf)
-{
- const struct cifs_tcon *tcon = cookie_netfs_data;
- char *sharename;
- uint16_t len;
-
- sharename = extract_sharename(tcon->treeName);
- if (IS_ERR(sharename)) {
- cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
- sharename = NULL;
- return 0;
- }
-
- len = strlen(sharename);
- if (len > maxbuf)
- return 0;
-
- memcpy(buffer, sharename, len);
-
- kfree(sharename);
-
- return len;
-}
-
-static uint16_t
-cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
- uint16_t maxbuf)
-{
- struct cifs_fscache_super_auxdata auxdata;
- const struct cifs_tcon *tcon = cookie_netfs_data;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.resource_id = tcon->resource_id;
-
- if (maxbuf > sizeof(auxdata))
- maxbuf = sizeof(auxdata);
-
- memcpy(buffer, &auxdata, maxbuf);
-
- return maxbuf;
-}
-
static enum
fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct cifs_fscache_super_auxdata auxdata;
const struct cifs_tcon *tcon = cookie_netfs_data;
@@ -212,68 +111,14 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
const struct fscache_cookie_def cifs_fscache_super_index_def = {
.name = "CIFS.super",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = cifs_super_get_key,
- .get_aux = cifs_fscache_super_get_aux,
.check_aux = cifs_fscache_super_check_aux,
};
-/*
- * Auxiliary data attached to CIFS inode within the cache
- */
-struct cifs_fscache_inode_auxdata {
- struct timespec last_write_time;
- struct timespec last_change_time;
- u64 eof;
-};
-
-static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct cifsInodeInfo *cifsi = cookie_netfs_data;
- uint16_t keylen;
-
- /* use the UniqueId as the key */
- keylen = sizeof(cifsi->uniqueid);
- if (keylen > maxbuf)
- keylen = 0;
- else
- memcpy(buffer, &cifsi->uniqueid, keylen);
-
- return keylen;
-}
-
-static void
-cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
-{
- const struct cifsInodeInfo *cifsi = cookie_netfs_data;
-
- *size = cifsi->vfs_inode.i_size;
-}
-
-static uint16_t
-cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
- uint16_t maxbuf)
-{
- struct cifs_fscache_inode_auxdata auxdata;
- const struct cifsInodeInfo *cifsi = cookie_netfs_data;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.eof = cifsi->server_eof;
- auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
- auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
-
- if (maxbuf > sizeof(auxdata))
- maxbuf = sizeof(auxdata);
-
- memcpy(buffer, &auxdata, maxbuf);
-
- return maxbuf;
-}
-
static enum
fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct cifs_fscache_inode_auxdata auxdata;
struct cifsInodeInfo *cifsi = cookie_netfs_data;
@@ -295,8 +140,5 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
const struct fscache_cookie_def cifs_fscache_inode_object_def = {
.name = "CIFS.uniqueid",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = cifs_fscache_inode_get_key,
- .get_attr = cifs_fscache_inode_get_attr,
- .get_aux = cifs_fscache_inode_get_aux,
.check_aux = cifs_fscache_inode_check_aux,
};
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f2b0a7f124da..a6ef088e057b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -36,37 +36,6 @@
#include <crypto/skcipher.h>
#include <crypto/aead.h>
-static int
-cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
-{
- int rc;
- unsigned int size;
-
- if (server->secmech.sdescmd5 != NULL)
- return 0; /* already allocated */
-
- server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(server->secmech.md5)) {
- cifs_dbg(VFS, "could not allocate crypto md5\n");
- rc = PTR_ERR(server->secmech.md5);
- server->secmech.md5 = NULL;
- return rc;
- }
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.md5);
- server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdescmd5) {
- crypto_free_shash(server->secmech.md5);
- server->secmech.md5 = NULL;
- return -ENOMEM;
- }
- server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
- server->secmech.sdescmd5->shash.flags = 0x0;
-
- return 0;
-}
-
int __cifs_calc_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server, char *signature,
struct shash_desc *shash)
@@ -132,13 +101,10 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
if (!rqst->rq_iov || !signature || !server)
return -EINVAL;
- if (!server->secmech.sdescmd5) {
- rc = cifs_crypto_shash_md5_allocate(server);
- if (rc) {
- cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
- return -1;
- }
- }
+ rc = cifs_alloc_hash("md5", &server->secmech.md5,
+ &server->secmech.sdescmd5);
+ if (rc)
+ return -1;
rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
if (rc) {
@@ -663,37 +629,6 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
return rc;
}
-static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
-{
- int rc;
- unsigned int size;
-
- /* check if already allocated */
- if (server->secmech.sdeschmacmd5)
- return 0;
-
- server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
- if (IS_ERR(server->secmech.hmacmd5)) {
- cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
- rc = PTR_ERR(server->secmech.hmacmd5);
- server->secmech.hmacmd5 = NULL;
- return rc;
- }
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacmd5);
- server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacmd5) {
- crypto_free_shash(server->secmech.hmacmd5);
- server->secmech.hmacmd5 = NULL;
- return -ENOMEM;
- }
- server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
- server->secmech.sdeschmacmd5->shash.flags = 0x0;
-
- return 0;
-}
-
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
@@ -757,9 +692,10 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
mutex_lock(&ses->server->srv_mutex);
- rc = crypto_hmacmd5_alloc(ses->server);
+ rc = cifs_alloc_hash("hmac(md5)",
+ &ses->server->secmech.hmacmd5,
+ &ses->server->secmech.sdeschmacmd5);
if (rc) {
- cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
goto unlock;
}
@@ -893,6 +829,11 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
server->secmech.md5 = NULL;
}
+ if (server->secmech.sha512) {
+ crypto_free_shash(server->secmech.sha512);
+ server->secmech.sha512 = NULL;
+ }
+
if (server->secmech.hmacmd5) {
crypto_free_shash(server->secmech.hmacmd5);
server->secmech.hmacmd5 = NULL;
@@ -916,4 +857,6 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
server->secmech.sdeschmacmd5 = NULL;
kfree(server->secmech.sdescmd5);
server->secmech.sdescmd5 = NULL;
+ kfree(server->secmech.sdescsha512);
+ server->secmech.sdescsha512 = NULL;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 32cdea67bbfd..f715609b13f3 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1486,6 +1486,7 @@ MODULE_SOFTDEP("pre: nls");
MODULE_SOFTDEP("pre: aes");
MODULE_SOFTDEP("pre: cmac");
MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: sha512");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
module_init(init_cifs)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 48f7c197cd2d..2282562e78a1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -130,10 +130,12 @@ struct cifs_secmech {
struct crypto_shash *md5; /* md5 hash function */
struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
struct crypto_shash *cmacaes; /* block-cipher based MAC function */
+ struct crypto_shash *sha512; /* sha512 hash function */
struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
+ struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */
struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */
struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */
};
@@ -466,6 +468,7 @@ struct smb_version_values {
__u32 exclusive_lock_type;
__u32 shared_lock_type;
__u32 unlock_lock_type;
+ size_t header_preamble_size;
size_t header_size;
size_t max_header_size;
size_t read_rsp_size;
@@ -673,7 +676,8 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
#ifdef CONFIG_CIFS_SMB311
- __u8 preauth_sha_hash[64]; /* save initital negprot hash */
+ /* save initial negprot hash */
+ __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
#endif /* 3.1.1 */
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
@@ -862,7 +866,7 @@ struct cifs_ses {
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
#ifdef CONFIG_CIFS_SMB311
- __u8 preauth_sha_hash[64];
+ __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
#endif /* 3.1.1 */
};
@@ -1466,6 +1470,7 @@ struct dfs_info3_param {
#define CIFS_FATTR_NEED_REVAL 0x4
#define CIFS_FATTR_INO_COLLISION 0x8
#define CIFS_FATTR_UNKNOWN_NLINK 0x10
+#define CIFS_FATTR_FAKE_ROOT_INO 0x20
struct cifs_fattr {
u32 cf_flags;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 93d565186698..365a414a75e9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -542,4 +542,9 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
void cifs_aio_ctx_release(struct kref *refcount);
int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
+
+int cifs_alloc_hash(const char *name, struct crypto_shash **shash,
+ struct sdesc **sdesc);
+void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9ceebf30eb22..59c09a596c0a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1454,7 +1454,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
unsigned int data_offset, data_len;
struct cifs_readdata *rdata = mid->callback_data;
char *buf = server->smallbuf;
- unsigned int buflen = get_rfc1002_length(buf) + 4;
+ unsigned int buflen = get_rfc1002_length(buf) +
+ server->vals->header_preamble_size;
bool use_rdma_mr = false;
cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
@@ -1504,7 +1505,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return cifs_readv_discard(server, mid);
}
- data_offset = server->ops->read_data_offset(buf) + 4;
+ data_offset = server->ops->read_data_offset(buf) +
+ server->vals->header_preamble_size;
if (data_offset < server->total_read) {
/*
* win2k8 sometimes sends an offset of 0 when the read
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a726f524fb84..4e0808f40195 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -775,7 +775,8 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
unsigned int pdu_length = get_rfc1002_length(buf);
/* make sure this will fit in a large buffer */
- if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
+ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) -
+ server->vals->header_preamble_size) {
cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server);
wake_up(&server->response_q);
@@ -791,7 +792,9 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* now read the rest */
length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
- pdu_length - HEADER_SIZE(server) + 1 + 4);
+ pdu_length - HEADER_SIZE(server) + 1
+ + server->vals->header_preamble_size);
+
if (length < 0)
return length;
server->total_read += length;
@@ -884,7 +887,8 @@ cifs_demultiplex_thread(void *p)
continue;
/* make sure we have enough to get to the MID */
- if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
+ if (pdu_length < HEADER_SIZE(server) - 1 -
+ server->vals->header_preamble_size) {
cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
pdu_length);
cifs_reconnect(server);
@@ -893,8 +897,10 @@ cifs_demultiplex_thread(void *p)
}
/* read down to the MID */
- length = cifs_read_from_socket(server, buf + 4,
- HEADER_SIZE(server) - 1 - 4);
+ length = cifs_read_from_socket(server,
+ buf + server->vals->header_preamble_size,
+ HEADER_SIZE(server) - 1
+ - server->vals->header_preamble_size);
if (length < 0)
continue;
server->total_read += length;
@@ -4306,7 +4312,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
server->sec_mode, server->capabilities, server->timeAdj);
if (ses->auth_key.response) {
- cifs_dbg(VFS, "Free previous auth_key.response = %p\n",
+ cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
ses->auth_key.response);
kfree(ses->auth_key.response);
ses->auth_key.response = NULL;
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 8d4b7bc8ae91..25d3f66b2d50 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -23,11 +23,63 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+/*
+ * Key layout of CIFS server cache index object
+ *
+ * The fixed-size hdr is always part of the key.  Of the address union,
+ * cifs_fscache_get_client_cookie() fills in and counts only the member
+ * that matches hdr.family, so the key length depends on the family.
+ */
+struct cifs_server_key {
+ struct {
+ uint16_t family; /* address family */
+ __be16 port; /* IP port */
+ } hdr;
+ union {
+ struct in_addr ipv4_addr; /* filled in for AF_INET */
+ struct in6_addr ipv6_addr; /* filled in for AF_INET6 */
+ };
+} __packed;
+
+/*
+ * Get a cookie for a server object keyed by {IPaddress,port,family} tuple
+ */
void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
{
+ const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
+ const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
+ const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
+ struct cifs_server_key key;
+ uint16_t key_len = sizeof(key.hdr);
+
+ memset(&key, 0, sizeof(key));
+
+ /*
+ * Should not be a problem as sin_family/sin6_family overlays
+ * sa_family field
+ */
+ key.hdr.family = sa->sa_family;
+ switch (sa->sa_family) {
+ case AF_INET:
+ key.hdr.port = addr->sin_port;
+ key.ipv4_addr = addr->sin_addr;
+ key_len += sizeof(key.ipv4_addr);
+ break;
+
+ case AF_INET6:
+ key.hdr.port = addr6->sin6_port;
+ key.ipv6_addr = addr6->sin6_addr;
+ key_len += sizeof(key.ipv6_addr);
+ break;
+
+ default:
+ cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
+ server->fscache = NULL;
+ return;
+ }
+
server->fscache =
fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
- &cifs_fscache_server_index_def, server, true);
+ &cifs_fscache_server_index_def,
+ &key, key_len,
+ NULL, 0,
+ server, 0, true);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server, server->fscache);
}
@@ -36,17 +88,29 @@ void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
{
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server, server->fscache);
- fscache_relinquish_cookie(server->fscache, 0);
+ fscache_relinquish_cookie(server->fscache, NULL, false);
server->fscache = NULL;
}
void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
{
struct TCP_Server_Info *server = tcon->ses->server;
+ char *sharename;
+
+ sharename = extract_sharename(tcon->treeName);
+ if (IS_ERR(sharename)) {
+ cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
+ tcon->fscache = NULL;
+ return;
+ }
tcon->fscache =
fscache_acquire_cookie(server->fscache,
- &cifs_fscache_super_index_def, tcon, true);
+ &cifs_fscache_super_index_def,
+ sharename, strlen(sharename),
+ &tcon->resource_id, sizeof(tcon->resource_id),
+ tcon, 0, true);
+ kfree(sharename);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server->fscache, tcon->fscache);
}
@@ -54,10 +118,28 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
{
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
- fscache_relinquish_cookie(tcon->fscache, 0);
+ fscache_relinquish_cookie(tcon->fscache, &tcon->resource_id, false);
tcon->fscache = NULL;
}
+static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi,
+ struct cifs_tcon *tcon)
+{
+ struct cifs_fscache_inode_auxdata auxdata;
+ /*
+ * Store a freshly acquired cookie in cifsi->fscache.  The cookie is
+ * requested against tcon->fscache and is given the inode's uniqueid
+ * (presumably as the index key; confirm against the
+ * fscache_acquire_cookie() prototype) plus an auxdata snapshot of
+ * the server eof, mtime and ctime.  The snapshot is zeroed as a whole
+ * first because it is handed over as a raw pointer/size pair.
+ */
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+ auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+ cifsi->fscache =
+ fscache_acquire_cookie(tcon->fscache,
+ &cifs_fscache_inode_object_def,
+ &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+ &auxdata, sizeof(auxdata),
+ cifsi, cifsi->vfs_inode.i_size, true);
+}
+
static void cifs_fscache_enable_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
@@ -67,21 +149,28 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
if (cifsi->fscache)
return;
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
- cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
- &cifs_fscache_inode_object_def, cifsi, true);
- cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
- __func__, tcon->fscache, cifsi->fscache);
- }
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE))
+ return;
+
+ cifs_fscache_acquire_inode_cookie(cifsi, tcon);
+
+ cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
+ __func__, tcon->fscache, cifsi->fscache);
}
void cifs_fscache_release_inode_cookie(struct inode *inode)
{
+ struct cifs_fscache_inode_auxdata auxdata;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+ auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
- fscache_relinquish_cookie(cifsi->fscache, 0);
+ fscache_relinquish_cookie(cifsi->fscache, &auxdata, false);
cifsi->fscache = NULL;
}
}
@@ -93,7 +182,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
if (cifsi->fscache) {
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
fscache_uncache_all_inode_pages(cifsi->fscache, inode);
- fscache_relinquish_cookie(cifsi->fscache, 1);
+ fscache_relinquish_cookie(cifsi->fscache, NULL, true);
cifsi->fscache = NULL;
}
}
@@ -110,16 +199,14 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct fscache_cookie *old = cifsi->fscache;
if (cifsi->fscache) {
/* retire the current fscache cache and get a new one */
- fscache_relinquish_cookie(cifsi->fscache, 1);
+ fscache_relinquish_cookie(cifsi->fscache, NULL, true);
- cifsi->fscache = fscache_acquire_cookie(
- cifs_sb_master_tcon(cifs_sb)->fscache,
- &cifs_fscache_inode_object_def,
- cifsi, true);
+ cifs_fscache_acquire_inode_cookie(cifsi, tcon);
cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
__func__, cifsi->fscache, old);
}
@@ -214,13 +301,15 @@ int __cifs_readpages_from_fscache(struct inode *inode,
void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
int ret;
cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
- __func__, CIFS_I(inode)->fscache, page, inode);
- ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
+ __func__, cifsi->fscache, page, inode);
+ ret = fscache_write_page(cifsi->fscache, page,
+ cifsi->vfs_inode.i_size, GFP_KERNEL);
if (ret != 0)
- fscache_uncache_page(CIFS_I(inode)->fscache, page);
+ fscache_uncache_page(cifsi->fscache, page);
}
void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
@@ -239,4 +328,3 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
fscache_wait_on_page_write(cookie, page);
fscache_uncache_page(cookie, page);
}
-
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 24794b6cd8ec..c7e3ac251e16 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -27,6 +27,18 @@
#ifdef CONFIG_CIFS_FSCACHE
+/*
+ * Auxiliary data attached to CIFS inode within the cache
+ *
+ * Filled in from the cifsInodeInfo when an inode cookie is acquired or
+ * relinquished in fscache.c; the whole struct is zeroed before the fields
+ * are assigned there.
+ */
+struct cifs_fscache_inode_auxdata {
+ struct timespec last_write_time; /* copy of vfs_inode.i_mtime */
+ struct timespec last_change_time; /* copy of vfs_inode.i_ctime */
+ u64 eof; /* copy of cifsInodeInfo server_eof */
+};
+
+/*
+ * cache.c
+ */
extern struct fscache_netfs cifs_fscache_netfs;
extern const struct fscache_cookie_def cifs_fscache_server_index_def;
extern const struct fscache_cookie_def cifs_fscache_super_index_def;
@@ -34,6 +46,7 @@ extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
extern int cifs_fscache_register(void);
extern void cifs_fscache_unregister(void);
+extern char *extract_sharename(const char *);
/*
* fscache.c
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8f9a8cc7cc62..f856df4adae3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -707,6 +707,18 @@ cgfi_exit:
return rc;
}
+/*
+ * Simple function to return a 64 bit hash of string. Rarely called
+ *
+ * Every byte of @str up to the terminating NUL is added to the running
+ * hash, which is then multiplied by a fixed constant.  An empty string
+ * hashes to 0.  @str is dereferenced unconditionally, so it must not be
+ * NULL.
+ */
+static __u64 simple_hashstr(const char *str)
+{
+ const __u64 hash_mult = 1125899906842597L; /* a big enough prime */
+ __u64 hash = 0;
+ /* NOTE(review): plain char is converted directly; where char is signed, bytes >= 0x80 sign-extend - confirm this is acceptable */
+ while (*str)
+ hash = (hash + (__u64) *str++) * hash_mult;
+
+ return hash;
+}
+
int
cifs_get_inode_info(struct inode **inode, const char *full_path,
FILE_ALL_INFO *data, struct super_block *sb, int xid,
@@ -816,6 +828,14 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
tmprc);
fattr.cf_uniqueid = iunique(sb, ROOT_I);
cifs_autodisable_serverino(cifs_sb);
+ } else if ((fattr.cf_uniqueid == 0) &&
+ strlen(full_path) == 0) {
+ /* some servers ret bad root ino ie 0 */
+ cifs_dbg(FYI, "Invalid (0) inodenum\n");
+ fattr.cf_flags |=
+ CIFS_FATTR_FAKE_ROOT_INO;
+ fattr.cf_uniqueid =
+ simple_hashstr(tcon->treeName);
}
}
} else
@@ -832,6 +852,16 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
&fattr.cf_uniqueid, data);
if (tmprc)
fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ else if ((fattr.cf_uniqueid == 0) &&
+ strlen(full_path) == 0) {
+ /*
+ * Reuse existing root inode num since
+ * inum zero for root causes ls of . and .. to
+ * not be returned
+ */
+ cifs_dbg(FYI, "Srv ret 0 inode num for root\n");
+ fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ }
} else
fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
}
@@ -893,6 +923,9 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
}
cgii_exit:
+ if ((*inode) && ((*inode)->i_ino == 0))
+ cifs_dbg(FYI, "inode number of zero returned\n");
+
kfree(buf);
cifs_put_tlink(tlink);
return rc;
@@ -1066,10 +1099,7 @@ iget_no_retry:
out:
kfree(path);
- /* can not call macro free_xid here since in a void func
- * TODO: This is no longer true
- */
- _free_xid(xid);
+ free_xid(xid);
return inode;
}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 60b5a11ee11b..889a840172eb 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -50,25 +50,12 @@ static int
symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
{
int rc;
- unsigned int size;
- struct crypto_shash *md5;
- struct sdesc *sdescmd5;
-
- md5 = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(md5)) {
- rc = PTR_ERR(md5);
- cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n",
- __func__, rc);
- return rc;
- }
- size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
- sdescmd5 = kmalloc(size, GFP_KERNEL);
- if (!sdescmd5) {
- rc = -ENOMEM;
+ struct crypto_shash *md5 = NULL;
+ struct sdesc *sdescmd5 = NULL;
+
+ rc = cifs_alloc_hash("md5", &md5, &sdescmd5);
+ if (rc)
goto symlink_hash_err;
- }
- sdescmd5->shash.tfm = md5;
- sdescmd5->shash.flags = 0x0;
rc = crypto_shash_init(&sdescmd5->shash);
if (rc) {
@@ -85,9 +72,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
symlink_hash_err:
- crypto_free_shash(md5);
- kfree(sdescmd5);
-
+ cifs_free_hash(&md5, &sdescmd5);
return rc;
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a0dbced4a45c..460084a8eac5 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -848,3 +848,57 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
return 0;
}
+
+/**
+ * cifs_alloc_hash - allocate hash and hash context together
+ * @name: algorithm name handed to crypto_alloc_shash()
+ * @shash: location that receives the allocated transform
+ * @sdesc: location that receives the allocated context
+ *
+ * The caller has to make sure @sdesc is initialized to either NULL or
+ * a valid context. Both can be freed via cifs_free_hash().
+ *
+ * If *@sdesc is already non-NULL nothing is allocated and 0 is returned,
+ * so the call can be repeated on the same pair.  On any failure both
+ * *@shash and *@sdesc are left NULL.
+ *
+ * Return: 0 on success (or if already allocated), the error code from
+ * crypto_alloc_shash(), or -ENOMEM if the context cannot be allocated.
+ */
+int
+cifs_alloc_hash(const char *name,
+ struct crypto_shash **shash, struct sdesc **sdesc)
+{
+ int rc = 0;
+ size_t size;
+ /* an existing context means the pair was set up by an earlier call */
+ if (*sdesc != NULL)
+ return 0;
+
+ *shash = crypto_alloc_shash(name, 0, 0);
+ if (IS_ERR(*shash)) {
+ cifs_dbg(VFS, "could not allocate crypto %s\n", name);
+ rc = PTR_ERR(*shash);
+ *shash = NULL;
+ *sdesc = NULL;
+ return rc;
+ }
+ /* the context is the shash_desc header plus the transform's own descriptor size */
+ size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash);
+ *sdesc = kmalloc(size, GFP_KERNEL);
+ if (*sdesc == NULL) {
+ cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name);
+ crypto_free_shash(*shash); /* do not leave a transform without a context */
+ *shash = NULL;
+ return -ENOMEM;
+ }
+ /* bind the new context to the transform it was sized for */
+ (*sdesc)->shash.tfm = *shash;
+ (*sdesc)->shash.flags = 0x0;
+ return 0;
+}
+
+/**
+ * cifs_free_hash - free hash and hash context together
+ * @shash: location of the transform to free; *@shash is reset to NULL
+ * @sdesc: location of the context to free; *@sdesc is reset to NULL
+ *
+ * Freeing a NULL hash or context is safe.
+ */
+void
+cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
+{
+ kfree(*sdesc);
+ *sdesc = NULL;
+ if (*shash) /* only the transform free is guarded against NULL here */
+ crypto_free_shash(*shash);
+ *shash = NULL;
+}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3d495e440c87..aff8ce8ba34d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1122,6 +1122,7 @@ struct smb_version_values smb1_values = {
.exclusive_lock_type = 0,
.shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
.unlock_lock_type = 0,
+ .header_preamble_size = 4,
.header_size = sizeof(struct smb_hdr),
.max_header_size = MAX_CIFS_HDR_SIZE,
.read_rsp_size = sizeof(READ_RSP),
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 62c88dfed57b..3bfc9c990724 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -745,7 +745,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
"STATUS_NOLOGON_SERVER_TRUST_ACCOUNT"},
{STATUS_DOMAIN_TRUST_INCONSISTENT, -EIO,
"STATUS_DOMAIN_TRUST_INCONSISTENT"},
- {STATUS_FS_DRIVER_REQUIRED, -EIO, "STATUS_FS_DRIVER_REQUIRED"},
+ {STATUS_FS_DRIVER_REQUIRED, -EOPNOTSUPP, "STATUS_FS_DRIVER_REQUIRED"},
{STATUS_IMAGE_ALREADY_LOADED_AS_DLL, -EIO,
"STATUS_IMAGE_ALREADY_LOADED_AS_DLL"},
{STATUS_NETWORK_OPEN_RESTRICTION, -EIO,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 76d03abaa38c..5406e95f5d92 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -150,7 +150,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
return 1;
}
- if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
+ if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE -
+ srvr->vals->header_preamble_size) {
cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
mid);
return 1;
@@ -189,26 +190,26 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
}
- if (4 + len != length) {
- cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n",
- length, 4 + len, mid);
+ if (srvr->vals->header_preamble_size + len != length) {
+ cifs_dbg(VFS, "Total length %u RFC1002 length %zu mismatch mid %llu\n",
+ length, srvr->vals->header_preamble_size + len, mid);
return 1;
}
clc_len = smb2_calc_size(hdr);
- if (4 + len != clc_len) {
- cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
- clc_len, 4 + len, mid);
+ if (srvr->vals->header_preamble_size + len != clc_len) {
+ cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n",
+ clc_len, srvr->vals->header_preamble_size + len, mid);
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
shdr->Status == STATUS_STOPPED_ON_SYMLINK)
return 0;
/* Windows 7 server returns 24 bytes more */
- if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
+ if (clc_len + 24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
/* server can return one byte more due to implied bcc[0] */
- if (clc_len == 4 + len + 1)
+ if (clc_len == srvr->vals->header_preamble_size + len + 1)
return 0;
/*
@@ -218,10 +219,10 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
* Log the server error (once), but allow it and continue
* since the frame is parseable.
*/
- if (clc_len < 4 /* RFC1001 header size */ + len) {
+ if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) {
printk_once(KERN_WARNING
- "SMB2 server sent bad RFC1001 len %d not %d\n",
- len, clc_len - 4);
+ "SMB2 server sent bad RFC1001 len %d not %zu\n",
+ len, clc_len - srvr->vals->header_preamble_size);
return 0;
}
@@ -706,3 +707,67 @@ smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
return 0;
}
+
+#ifdef CONFIG_CIFS_SMB311
+/**
+ * smb311_update_preauth_hash - update @ses hash with the packet data in @iov
+ *
+ * Assumes @iov does not contain the rfc1002 length and iov[0] has the
+ * SMB2 header.
+ */
+int
+smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
+{
+ int i, rc;
+ struct sdesc *d;
+ struct smb2_sync_hdr *hdr;
+
+ if (ses->server->tcpStatus == CifsGood) {
+ /* skip non smb311 connections */
+ if (ses->server->dialect != SMB311_PROT_ID)
+ return 0;
+
+ /* skip last sess setup response */
+ hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ if (hdr->Flags & SMB2_FLAGS_SIGNED)
+ return 0;
+ }
+
+ rc = smb311_crypto_shash_allocate(ses->server);
+ if (rc)
+ return rc;
+
+ d = ses->server->secmech.sdescsha512;
+ rc = crypto_shash_init(&d->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__);
+ return rc;
+ }
+
+ rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash,
+ SMB2_PREAUTH_HASH_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: could not update sha512 shash\n", __func__);
+ return rc;
+ }
+
+ for (i = 0; i < nvec; i++) {
+ rc = crypto_shash_update(&d->shash,
+ iov[i].iov_base, iov[i].iov_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: could not update sha512 shash\n",
+ __func__);
+ return rc;
+ }
+ }
+
+ rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: could not finalize sha512 shash\n",
+ __func__);
+ return rc;
+ }
+
+ return 0;
+}
+#endif
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index eb68e2fcc500..968b1d43a1ea 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1412,7 +1412,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
} while (rc == -EAGAIN);
if (rc) {
- if (rc != -ENOENT)
+ if ((rc != -ENOENT) && (rc != -EOPNOTSUPP))
cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc);
goto out;
}
@@ -1457,6 +1457,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int sub_offset;
unsigned int print_len;
unsigned int print_offset;
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -1479,7 +1481,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
}
if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
- get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) {
+ get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) {
kfree(utf16_path);
return -ENOENT;
}
@@ -1492,13 +1494,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
print_len = le16_to_cpu(symlink->PrintNameLength);
print_offset = le16_to_cpu(symlink->PrintNameOffset);
- if (get_rfc1002_length(err_buf) + 4 <
+ if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size <
SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
kfree(utf16_path);
return -ENOENT;
}
- if (get_rfc1002_length(err_buf) + 4 <
+ if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size <
SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
kfree(utf16_path);
return -ENOENT;
@@ -2050,7 +2052,8 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile)
}
static void
-fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
+fill_transform_hdr(struct TCP_Server_Info *server,
+ struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
{
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base;
@@ -2062,10 +2065,19 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
tr_hdr->Flags = cpu_to_le16(0x01);
get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
- inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4);
+ inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size);
inc_rfc1001_len(tr_hdr, orig_len);
}
+/* We can not use the normal sg_set_buf() as we will sometimes pass a
+ * stack object as buf.
+ */
+static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
+ unsigned int buflen)
+{
+ sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
+}
+
static struct scatterlist *
init_sg(struct smb_rqst *rqst, u8 *sign)
{
@@ -2080,16 +2092,16 @@ init_sg(struct smb_rqst *rqst, u8 *sign)
return NULL;
sg_init_table(sg, sg_len);
- sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len);
+ smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len);
for (i = 1; i < rqst->rq_nvec; i++)
- sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base,
+ smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base,
rqst->rq_iov[i].iov_len);
for (j = 0; i < sg_len - 1; i++, j++) {
unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz
: rqst->rq_tailsz;
sg_set_page(&sg[i], rqst->rq_pages[j], len, 0);
}
- sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE);
+ smb2_sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE);
return sg;
}
@@ -2125,7 +2137,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
{
struct smb2_transform_hdr *tr_hdr =
(struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size;
int rc = 0;
struct scatterlist *sg;
u8 sign[SMB2_SIGNATURE_SIZE] = {};
@@ -2253,7 +2265,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
goto err_free_iov;
/* fill the 1st iov with a transform header */
- fill_transform_hdr(tr_hdr, old_rq);
+ fill_transform_hdr(server, tr_hdr, old_rq);
new_rq->rq_iov[0].iov_base = tr_hdr;
new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr);
@@ -2335,10 +2347,10 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
if (rc)
return rc;
- memmove(buf + 4, iov[1].iov_base, buf_data_size);
+ memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size);
hdr = (struct smb2_hdr *)buf;
hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size);
- server->total_read = buf_data_size + page_data_size + 4;
+ server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size;
return rc;
}
@@ -2442,7 +2454,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- data_offset = server->ops->read_data_offset(buf) + 4;
+ data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size;
#ifdef CONFIG_CIFS_SMB_DIRECT
use_rdma_mr = rdata->mr;
#endif
@@ -2538,11 +2550,12 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
unsigned int npages;
struct page **pages;
unsigned int len;
- unsigned int buflen = get_rfc1002_length(buf) + 4;
+ unsigned int buflen = get_rfc1002_length(buf) + server->vals->header_preamble_size;
int rc;
int i = 0;
- len = min_t(unsigned int, buflen, server->vals->read_rsp_size - 4 +
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size -
+ server->vals->header_preamble_size +
sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len);
@@ -2550,8 +2563,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
return rc;
server->total_read += rc;
- len = le32_to_cpu(tr_hdr->OriginalMessageSize) + 4 -
- server->vals->read_rsp_size;
+ len = le32_to_cpu(tr_hdr->OriginalMessageSize) +
+ server->vals->header_preamble_size -
+ server->vals->read_rsp_size;
npages = DIV_ROUND_UP(len, PAGE_SIZE);
pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
@@ -2577,7 +2591,8 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
if (rc)
goto free_pages;
- rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - 4,
+ rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size -
+ server->vals->header_preamble_size,
pages, npages, len);
if (rc)
goto free_pages;
@@ -2614,7 +2629,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid_entry;
/* switch to large buffer if too big for a small one */
- if (pdu_length + 4 > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ if (pdu_length + server->vals->header_preamble_size > MAX_CIFS_SMALL_BUFFER_SIZE) {
server->large_buf = true;
memcpy(server->bigbuf, buf, server->total_read);
buf = server->bigbuf;
@@ -2622,12 +2637,13 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
/* now read the rest */
length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
- pdu_length - HEADER_SIZE(server) + 1 + 4);
+ pdu_length - HEADER_SIZE(server) + 1 +
+ server->vals->header_preamble_size);
if (length < 0)
return length;
server->total_read += length;
- buf_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr);
+ buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr);
length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0);
if (length)
return length;
@@ -2656,7 +2672,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- if (pdu_length + 4 < sizeof(struct smb2_transform_hdr) +
+ if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) +
sizeof(struct smb2_sync_hdr)) {
cifs_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
@@ -2665,14 +2681,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
return -ECONNABORTED;
}
- if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) {
+ if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) {
cifs_dbg(VFS, "Transform message is broken\n");
cifs_reconnect(server);
wake_up(&server->response_q);
return -ECONNABORTED;
}
- if (pdu_length + 4 > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+ if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
return receive_encrypted_read(server, mid);
return receive_encrypted_standard(server, mid);
@@ -2683,7 +2699,8 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
- return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + 4,
+ return handle_read_data(server, mid, buf, get_rfc1002_length(buf) +
+ server->vals->header_preamble_size,
NULL, 0, 0);
}
@@ -3088,6 +3105,7 @@ struct smb_version_values smb20_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3108,6 +3126,7 @@ struct smb_version_values smb21_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3128,6 +3147,7 @@ struct smb_version_values smb3any_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3148,6 +3168,7 @@ struct smb_version_values smbdefault_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3168,6 +3189,7 @@ struct smb_version_values smb30_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3188,6 +3210,7 @@ struct smb_version_values smb302_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3209,6 +3232,7 @@ struct smb_version_values smb311_values = {
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
+ .header_preamble_size = 4,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 63778ac22fd9..f7741cee2a4c 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -453,6 +453,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
return rc;
req->sync_hdr.SessionId = 0;
+#ifdef CONFIG_CIFS_SMB311
+ memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
+ memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
+#endif
if (strcmp(ses->server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
@@ -564,6 +568,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
/* BB: add check that dialect was valid given dialect(s) we asked for */
+#ifdef CONFIG_CIFS_SMB311
+ /*
+ * Keep a copy of the hash after negprot. This hash will be
+ * the starting hash value for all sessions made from this
+ * server.
+ */
+ memcpy(server->preauth_sha_hash, ses->preauth_sha_hash,
+ SMB2_PREAUTH_HASH_SIZE);
+#endif
/* SMB2 only has an extended negflavor */
server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
/* set it to the maximum buffer size value we can send with 1 credit */
@@ -571,8 +584,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
SMB2_MAX_BUFFER_SIZE);
server->max_read = le32_to_cpu(rsp->MaxReadSize);
server->max_write = le32_to_cpu(rsp->MaxWriteSize);
- /* BB Do we need to validate the SecurityMode? */
server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+ if ((server->sec_mode & SMB2_SEC_MODE_FLAGS_ALL) != server->sec_mode)
+ cifs_dbg(FYI, "Server returned unexpected security mode 0x%x\n",
+ server->sec_mode);
server->capabilities = le32_to_cpu(rsp->Capabilities);
/* Internal types */
server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
@@ -621,6 +636,10 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
return 0;
#endif
+ /* In SMB3.11 preauth integrity supersedes validate negotiate */
+ if (tcon->ses->server->dialect == SMB311_PROT_ID)
+ return 0;
+
/*
* validation ioctl must be signed, so no point sending this if we
* can not sign it (ie are not known user). Even if signing is not
@@ -1148,6 +1167,14 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
sess_data->buf0_type = CIFS_NO_BUFFER;
sess_data->nls_cp = (struct nls_table *) nls_cp;
+#ifdef CONFIG_CIFS_SMB311
+ /*
+ * Initialize the session hash with the server one.
+ */
+ memcpy(ses->preauth_sha_hash, ses->server->preauth_sha_hash,
+ SMB2_PREAUTH_HASH_SIZE);
+#endif
+
while (sess_data->func)
sess_data->func(sess_data);
@@ -1280,6 +1307,11 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
iov[1].iov_base = unc_path;
iov[1].iov_len = unc_path_len;
+ /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
+ if ((ses->server->dialect == SMB311_PROT_ID) &&
+ !encryption_required(tcon))
+ req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+
rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
@@ -1441,7 +1473,7 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
unsigned int remaining;
char *name;
- data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset);
remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
while (remaining >= sizeof(struct create_context)) {
@@ -1738,8 +1770,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, path);
- if (rc)
+ if (rc) {
+ cifs_small_buf_release(req);
return rc;
+ }
req->NameLength = cpu_to_le16(name_len * 2);
uni_path_len = copy_size;
path = copy_path;
@@ -1750,8 +1784,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (uni_path_len % 8 != 0) {
copy_size = roundup(uni_path_len, 8);
copy_path = kzalloc(copy_size, GFP_KERNEL);
- if (!copy_path)
+ if (!copy_path) {
+ cifs_small_buf_release(req);
return -ENOMEM;
+ }
memcpy((char *)copy_path, (const char *)path,
uni_path_len);
uni_path_len = copy_size;
@@ -3418,6 +3454,7 @@ static int
build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
int outbuf_len, u64 persistent_fid, u64 volatile_fid)
{
+ struct TCP_Server_Info *server = tcon->ses->server;
int rc;
struct smb2_query_info_req *req;
unsigned int total_len;
@@ -3440,7 +3477,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
req->InputBufferOffset =
cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
req->OutputBufferLength = cpu_to_le32(
- outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4);
+ outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size);
iov->iov_base = (char *)req;
iov->iov_len = total_len;
@@ -3457,6 +3494,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
struct smb2_fs_full_size_info *info = NULL;
int flags = 0;
@@ -3477,7 +3515,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
}
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
- info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ +
+ info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size +
le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
@@ -3500,6 +3538,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype, max_len, min_len;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
unsigned int rsp_len, offset;
int flags = 0;
@@ -3540,15 +3579,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
goto qfsattr_exit;
if (level == FS_ATTRIBUTE_INFORMATION)
- memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
+ memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset
+ (char *)&rsp->hdr, min_t(unsigned int,
rsp_len, max_len));
else if (level == FS_DEVICE_INFORMATION)
- memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
+ memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset
+ (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
else if (level == FS_SECTOR_SIZE_INFORMATION) {
struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
- (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr);
+ (server->vals->header_preamble_size + offset + (char *)&rsp->hdr);
tcon->ss_flags = le32_to_cpu(ss_info->Flags);
tcon->perf_sector_size =
le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 2a2b34ccaf49..253e2c7c952f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -249,6 +249,8 @@ struct smb2_negotiate_req {
/* SecurityMode flags */
#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
+#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
+
/* Capabilities flags */
#define SMB2_GLOBAL_CAP_DFS 0x00000001
#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
@@ -264,6 +266,7 @@ struct smb2_negotiate_req {
#define SMB311_SALT_SIZE 32
/* Hash Algorithm Types */
#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
+#define SMB2_PREAUTH_HASH_SIZE 64
struct smb2_preauth_neg_context {
__le16 ContextType; /* 1 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 05287b01f596..cbcce3f7e86f 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -202,4 +202,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+#ifdef CONFIG_CIFS_SMB311
+extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
+extern int smb311_update_preauth_hash(struct cifs_ses *ses,
+ struct kvec *iov, int nvec);
+#endif
#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 99493946e2f9..bf49cb73b9e6 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -43,77 +43,62 @@
static int
smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
{
- int rc;
- unsigned int size;
+ return cifs_alloc_hash("hmac(sha256)",
+ &server->secmech.hmacsha256,
+ &server->secmech.sdeschmacsha256);
+}
- if (server->secmech.sdeschmacsha256 != NULL)
- return 0; /* already allocated */
+static int
+smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ struct cifs_secmech *p = &server->secmech;
+ int rc;
- server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
- if (IS_ERR(server->secmech.hmacsha256)) {
- cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
- rc = PTR_ERR(server->secmech.hmacsha256);
- server->secmech.hmacsha256 = NULL;
- return rc;
- }
+ rc = cifs_alloc_hash("hmac(sha256)",
+ &p->hmacsha256,
+ &p->sdeschmacsha256);
+ if (rc)
+ goto err;
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacsha256);
- server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacsha256) {
- crypto_free_shash(server->secmech.hmacsha256);
- server->secmech.hmacsha256 = NULL;
- return -ENOMEM;
- }
- server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
- server->secmech.sdeschmacsha256->shash.flags = 0x0;
+ rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes);
+ if (rc)
+ goto err;
return 0;
+err:
+ cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256);
+ return rc;
}
-static int
-smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+#ifdef CONFIG_CIFS_SMB311
+int
+smb311_crypto_shash_allocate(struct TCP_Server_Info *server)
{
- unsigned int size;
- int rc;
-
- if (server->secmech.sdesccmacaes != NULL)
- return 0; /* already allocated */
+ struct cifs_secmech *p = &server->secmech;
+ int rc = 0;
- rc = smb2_crypto_shash_allocate(server);
+ rc = cifs_alloc_hash("hmac(sha256)",
+ &p->hmacsha256,
+ &p->sdeschmacsha256);
if (rc)
return rc;
- server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
- if (IS_ERR(server->secmech.cmacaes)) {
- cifs_dbg(VFS, "could not allocate crypto cmac-aes");
- kfree(server->secmech.sdeschmacsha256);
- server->secmech.sdeschmacsha256 = NULL;
- crypto_free_shash(server->secmech.hmacsha256);
- server->secmech.hmacsha256 = NULL;
- rc = PTR_ERR(server->secmech.cmacaes);
- server->secmech.cmacaes = NULL;
- return rc;
- }
+ rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes);
+ if (rc)
+ goto err;
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.cmacaes);
- server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdesccmacaes) {
- cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
- kfree(server->secmech.sdeschmacsha256);
- server->secmech.sdeschmacsha256 = NULL;
- crypto_free_shash(server->secmech.hmacsha256);
- crypto_free_shash(server->secmech.cmacaes);
- server->secmech.hmacsha256 = NULL;
- server->secmech.cmacaes = NULL;
- return -ENOMEM;
- }
- server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
- server->secmech.sdesccmacaes->shash.flags = 0x0;
+ rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512);
+ if (rc)
+ goto err;
return 0;
+
+err:
+ cifs_free_hash(&p->cmacaes, &p->sdesccmacaes);
+ cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256);
+ return rc;
}
+#endif
static struct cifs_ses *
smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
@@ -457,7 +442,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
return rc;
}
-
+
rc = __cifs_calc_signature(rqst, server, sigptr,
&server->secmech.sdesccmacaes->shash);
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 91710eb571fb..5008af546dd1 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -862,6 +862,8 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
ib_dma_unmap_single(info->id->device, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
+ smbd_disconnect_rdma_connection(info);
+
dma_mapping_failed:
mempool_free(request, info->request_mempool);
return rc;
@@ -1025,7 +1027,7 @@ static int smbd_post_send(struct smbd_connection *info,
for (i = 0; i < request->num_sge; i++) {
log_rdma_send(INFO,
- "rdma_request sge[%d] addr=%llu legnth=%u\n",
+ "rdma_request sge[%d] addr=%llu length=%u\n",
i, request->sge[0].addr, request->sge[0].length);
ib_dma_sync_single_for_device(
info->id->device,
@@ -1061,6 +1063,7 @@ static int smbd_post_send(struct smbd_connection *info,
if (atomic_dec_and_test(&info->send_pending))
wake_up(&info->wait_send_pending);
}
+ smbd_disconnect_rdma_connection(info);
} else
/* Reset timer for idle connection after packet is sent */
mod_delayed_work(info->workqueue, &info->idle_timer_work,
@@ -1202,7 +1205,7 @@ static int smbd_post_recv(
if (rc) {
ib_dma_unmap_single(info->id->device, response->sge.addr,
response->sge.length, DMA_FROM_DEVICE);
-
+ smbd_disconnect_rdma_connection(info);
log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
}
@@ -1498,8 +1501,8 @@ int smbd_reconnect(struct TCP_Server_Info *server)
log_rdma_event(INFO, "reconnecting rdma session\n");
if (!server->smbd_conn) {
- log_rdma_event(ERR, "rdma session already destroyed\n");
- return -EINVAL;
+ log_rdma_event(INFO, "rdma session already destroyed\n");
+ goto create_conn;
}
/*
@@ -1512,15 +1515,19 @@ int smbd_reconnect(struct TCP_Server_Info *server)
}
/* wait until the transport is destroyed */
- wait_event(server->smbd_conn->wait_destroy,
- server->smbd_conn->transport_status == SMBD_DESTROYED);
+ if (!wait_event_timeout(server->smbd_conn->wait_destroy,
+ server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ))
+ return -EAGAIN;
destroy_workqueue(server->smbd_conn->workqueue);
kfree(server->smbd_conn);
+create_conn:
log_rdma_event(INFO, "creating rdma session\n");
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
+ log_rdma_event(INFO, "created rdma session info=%p\n",
+ server->smbd_conn);
return server->smbd_conn ? 0 : -ENOENT;
}
@@ -2295,7 +2302,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
rc = ib_dereg_mr(smbdirect_mr->mr);
if (rc) {
log_rdma_mr(ERR,
- "ib_dereg_mr faield rc=%x\n",
+ "ib_dereg_mr failed rc=%x\n",
rc);
smbd_disconnect_rdma_connection(info);
}
@@ -2542,6 +2549,8 @@ dma_map_error:
if (atomic_dec_and_test(&info->mr_used_count))
wake_up(&info->wait_for_mr_cleanup);
+ smbd_disconnect_rdma_connection(info);
+
return NULL;
}
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index c12bffefa3c9..a0b80ac651a6 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -121,25 +121,12 @@ int
mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
{
int rc;
- unsigned int size;
- struct crypto_shash *md4;
- struct sdesc *sdescmd4;
-
- md4 = crypto_alloc_shash("md4", 0, 0);
- if (IS_ERR(md4)) {
- rc = PTR_ERR(md4);
- cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n",
- __func__, rc);
- return rc;
- }
- size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
- sdescmd4 = kmalloc(size, GFP_KERNEL);
- if (!sdescmd4) {
- rc = -ENOMEM;
+ struct crypto_shash *md4 = NULL;
+ struct sdesc *sdescmd4 = NULL;
+
+ rc = cifs_alloc_hash("md4", &md4, &sdescmd4);
+ if (rc)
goto mdfour_err;
- }
- sdescmd4->shash.tfm = md4;
- sdescmd4->shash.flags = 0x0;
rc = crypto_shash_init(&sdescmd4->shash);
if (rc) {
@@ -156,9 +143,7 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
mdfour_err:
- crypto_free_shash(md4);
- kfree(sdescmd4);
-
+ cifs_free_hash(&md4, &sdescmd4);
return rc;
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9779b3292d8e..279718dcb2ed 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -37,6 +37,7 @@
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
+#include "smb2proto.h"
#include "smbdirect.h"
/* Max number of iovectors we can use off the stack when sending requests. */
@@ -751,6 +752,12 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
goto out;
+#ifdef CONFIG_CIFS_SMB311
+ if (ses->status == CifsNew)
+ smb311_update_preauth_hash(ses, rqst->rq_iov+1,
+ rqst->rq_nvec-1);
+#endif
+
if (timeout == CIFS_ASYNC_OP)
goto out;
@@ -783,12 +790,23 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
buf = (char *)midQ->resp_buf;
resp_iov->iov_base = buf;
- resp_iov->iov_len = get_rfc1002_length(buf) + 4;
+ resp_iov->iov_len = get_rfc1002_length(buf) +
+ ses->server->vals->header_preamble_size;
if (midQ->large_buf)
*resp_buf_type = CIFS_LARGE_BUFFER;
else
*resp_buf_type = CIFS_SMALL_BUFFER;
+#ifdef CONFIG_CIFS_SMB311
+ if (ses->status == CifsNew) {
+ struct kvec iov = {
+ .iov_base = buf + 4,
+ .iov_len = get_rfc1002_length(buf)
+ };
+ smb311_update_preauth_hash(ses, &iov, 1);
+ }
+#endif
+
credits = ses->server->ops->get_credits(midQ);
rc = ses->server->ops->check_receive(midQ, ses->server,
diff --git a/fs/d_path.c b/fs/d_path.c
new file mode 100644
index 000000000000..e8fce6b1174f
--- /dev/null
+++ b/fs/d_path.c
@@ -0,0 +1,470 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+#include "mount.h"
+
+static int prepend(char **buffer, int *buflen, const char *str, int namelen)
+{
+ *buflen -= namelen;
+ if (*buflen < 0)
+ return -ENAMETOOLONG;
+ *buffer -= namelen;
+ memcpy(*buffer, str, namelen);
+ return 0;
+}
+
+/**
+ * prepend_name - prepend a pathname in front of current buffer pointer
+ * @buffer: pointer to the current buffer position (moved back by the name)
+ * @buflen: pointer to the remaining length of the buffer
+ * @name: name string and length qstr structure
+ *
+ * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
+ * make sure that either the old or the new name pointer and length are
+ * fetched. However, there may be mismatch between length and pointer.
+ * The length cannot be trusted, so we need to copy it byte-by-byte until
+ * the length is reached or a NUL byte is found. It also prepends "/" at
+ * the beginning of the name. The sequence number check at the caller will
+ * retry when a d_move() does happen, so any garbage in the buffer
+ * due to mismatched pointer and length will be discarded.
+ *
+ * Load acquire is needed to make sure that we see that terminating NUL.
+ */
+static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
+{
+ const char *dname = smp_load_acquire(&name->name); /* ^^^ */
+ u32 dlen = READ_ONCE(name->len);
+ char *p;
+
+ *buflen -= dlen + 1;
+ if (*buflen < 0)
+ return -ENAMETOOLONG;
+ p = *buffer -= dlen + 1;
+ *p++ = '/';
+ while (dlen--) {
+ char c = *dname++;
+ if (!c)
+ break;
+ *p++ = c;
+ }
+ return 0;
+}
+
+/**
+ * prepend_path - Prepend path string to a buffer
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry
+ * @buffer: pointer to the end of the buffer
+ * @buflen: pointer to buffer length
+ *
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
+ */
+static int prepend_path(const struct path *path,
+ const struct path *root,
+ char **buffer, int *buflen)
+{
+ struct dentry *dentry;
+ struct vfsmount *vfsmnt;
+ struct mount *mnt;
+ int error = 0;
+ unsigned seq, m_seq = 0;
+ char *bptr;
+ int blen;
+
+ rcu_read_lock();
+restart_mnt:
+ read_seqbegin_or_lock(&mount_lock, &m_seq);
+ seq = 0;
+ rcu_read_lock();
+restart:
+ bptr = *buffer;
+ blen = *buflen;
+ error = 0;
+ dentry = path->dentry;
+ vfsmnt = path->mnt;
+ mnt = real_mount(vfsmnt);
+ read_seqbegin_or_lock(&rename_lock, &seq);
+ while (dentry != root->dentry || vfsmnt != root->mnt) {
+ struct dentry * parent;
+
+ if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+ struct mount *parent = READ_ONCE(mnt->mnt_parent);
+ /* Escaped? */
+ if (dentry != vfsmnt->mnt_root) {
+ bptr = *buffer;
+ blen = *buflen;
+ error = 3;
+ break;
+ }
+ /* Global root? */
+ if (mnt != parent) {
+ dentry = READ_ONCE(mnt->mnt_mountpoint);
+ mnt = parent;
+ vfsmnt = &mnt->mnt;
+ continue;
+ }
+ if (!error)
+ error = is_mounted(vfsmnt) ? 1 : 2;
+ break;
+ }
+ parent = dentry->d_parent;
+ prefetch(parent);
+ error = prepend_name(&bptr, &blen, &dentry->d_name);
+ if (error)
+ break;
+
+ dentry = parent;
+ }
+ if (!(seq & 1))
+ rcu_read_unlock();
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1;
+ goto restart;
+ }
+ done_seqretry(&rename_lock, seq);
+
+ if (!(m_seq & 1))
+ rcu_read_unlock();
+ if (need_seqretry(&mount_lock, m_seq)) {
+ m_seq = 1;
+ goto restart_mnt;
+ }
+ done_seqretry(&mount_lock, m_seq);
+
+ if (error >= 0 && bptr == *buffer) {
+ if (--blen < 0)
+ error = -ENAMETOOLONG;
+ else
+ *--bptr = '/';
+ }
+ *buffer = bptr;
+ *buflen = blen;
+ return error;
+}
+
+/**
+ * __d_path - return the path of a dentry
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * Convert a dentry into an ASCII path name.
+ *
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
+ *
+ * "buflen" should be positive.
+ *
+ * If the path is not reachable from the supplied root, return %NULL.
+ */
+char *__d_path(const struct path *path,
+ const struct path *root,
+ char *buf, int buflen)
+{
+ char *res = buf + buflen;
+ int error;
+
+ prepend(&res, &buflen, "\0", 1);
+ error = prepend_path(path, root, &res, &buflen);
+
+ if (error < 0)
+ return ERR_PTR(error);
+ if (error > 0)
+ return NULL;
+ return res;
+}
+
+char *d_absolute_path(const struct path *path,
+ char *buf, int buflen)
+{
+ struct path root = {};
+ char *res = buf + buflen;
+ int error;
+
+ prepend(&res, &buflen, "\0", 1);
+ error = prepend_path(path, &root, &res, &buflen);
+
+ if (error > 1)
+ error = -EINVAL;
+ if (error < 0)
+ return ERR_PTR(error);
+ return res;
+}
+
+/*
+ * same as __d_path but appends "(deleted)" for unlinked files.
+ */
+static int path_with_deleted(const struct path *path,
+ const struct path *root,
+ char **buf, int *buflen)
+{
+ prepend(buf, buflen, "\0", 1);
+ if (d_unlinked(path->dentry)) {
+ int error = prepend(buf, buflen, " (deleted)", 10);
+ if (error)
+ return error;
+ }
+
+ return prepend_path(path, root, buf, buflen);
+}
+
+static int prepend_unreachable(char **buffer, int *buflen)
+{
+ return prepend(buffer, buflen, "(unreachable)", 13);
+}
+
+static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
+{
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ *root = fs->root;
+ } while (read_seqcount_retry(&fs->seq, seq));
+}
+
+/**
+ * d_path - return the path of a dentry
+ * @path: path to report
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * Convert a dentry into an ASCII path name. If the entry has been deleted
+ * the string " (deleted)" is appended. Note that this is ambiguous.
+ *
+ * Returns a pointer into the buffer or an error code if the path was
+ * too long. Note: Callers should use the returned pointer, not the passed
+ * in buffer, to use the name! The implementation often starts at an offset
+ * into the buffer, and may leave 0 bytes at the start.
+ *
+ * "buflen" should be positive.
+ */
+char *d_path(const struct path *path, char *buf, int buflen)
+{
+ char *res = buf + buflen;
+ struct path root;
+ int error;
+
+ /*
+ * We have various synthetic filesystems that never get mounted. On
+ * these filesystems dentries are never used for lookup purposes, and
+ * thus don't need to be hashed. They also don't need a name until a
+ * user wants to identify the object in /proc/pid/fd/. The little hack
+ * below allows us to generate a name for these objects on demand:
+ *
+ * Some pseudo inodes are mountable. When they are mounted
+ * path->dentry == path->mnt->mnt_root. In that case don't call d_dname
+ * and instead have d_path return the mounted path.
+ */
+ if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+ (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
+ return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
+
+ rcu_read_lock();
+ get_fs_root_rcu(current->fs, &root);
+ error = path_with_deleted(path, &root, &res, &buflen);
+ rcu_read_unlock();
+
+ if (error < 0)
+ res = ERR_PTR(error);
+ return res;
+}
+EXPORT_SYMBOL(d_path);
+
+/*
+ * Helper function for dentry_operations.d_dname() members
+ */
+char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
+ const char *fmt, ...)
+{
+ va_list args;
+ char temp[64];
+ int sz;
+
+ va_start(args, fmt);
+ sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
+ va_end(args);
+
+ if (sz > sizeof(temp) || sz > buflen)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ buffer += buflen - sz;
+ return memcpy(buffer, temp, sz);
+}
+
+char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ char *end = buffer + buflen;
+ /* these dentries are never renamed, so d_lock is not needed */
+ if (prepend(&end, &buflen, " (deleted)", 11) ||
+ prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
+ prepend(&end, &buflen, "/", 1))
+ end = ERR_PTR(-ENAMETOOLONG);
+ return end;
+}
+EXPORT_SYMBOL(simple_dname);
+
+/*
+ * Write full pathname from the root of the filesystem into the buffer.
+ */
+static char *__dentry_path(struct dentry *d, char *buf, int buflen)
+{
+ struct dentry *dentry;
+ char *end, *retval;
+ int len, seq = 0;
+ int error = 0;
+
+ if (buflen < 2)
+ goto Elong;
+
+ rcu_read_lock();
+restart:
+ dentry = d;
+ end = buf + buflen;
+ len = buflen;
+ prepend(&end, &len, "\0", 1);
+ /* Get '/' right */
+ retval = end-1;
+ *retval = '/';
+ read_seqbegin_or_lock(&rename_lock, &seq);
+ while (!IS_ROOT(dentry)) {
+ struct dentry *parent = dentry->d_parent;
+
+ prefetch(parent);
+ error = prepend_name(&end, &len, &dentry->d_name);
+ if (error)
+ break;
+
+ retval = end;
+ dentry = parent;
+ }
+ if (!(seq & 1))
+ rcu_read_unlock();
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1;
+ goto restart;
+ }
+ done_seqretry(&rename_lock, seq);
+ if (error)
+ goto Elong;
+ return retval;
+Elong:
+ return ERR_PTR(-ENAMETOOLONG);
+}
+
+char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+{
+ return __dentry_path(dentry, buf, buflen);
+}
+EXPORT_SYMBOL(dentry_path_raw);
+
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+ char *p = NULL;
+ char *retval;
+
+ if (d_unlinked(dentry)) {
+ p = buf + buflen;
+ if (prepend(&p, &buflen, "//deleted", 10) != 0)
+ goto Elong;
+ buflen++;
+ }
+ retval = __dentry_path(dentry, buf, buflen);
+ if (!IS_ERR(retval) && p)
+ *p = '/'; /* restore '/' overridden with '\0' */
+ return retval;
+Elong:
+ return ERR_PTR(-ENAMETOOLONG);
+}
+
+static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
+ struct path *pwd)
+{
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ *root = fs->root;
+ *pwd = fs->pwd;
+ } while (read_seqcount_retry(&fs->seq, seq));
+}
+
+/*
+ * NOTE! The user-level library version returns a
+ * character pointer. The kernel system call just
+ * returns the length of the buffer filled (which
+ * includes the ending '\0' character), or a negative
+ * error value. So libc would do something like
+ *
+ * char *getcwd(char * buf, size_t size)
+ * {
+ * int retval;
+ *
+ * retval = sys_getcwd(buf, size);
+ * if (retval >= 0)
+ * return buf;
+ * errno = -retval;
+ * return NULL;
+ * }
+ */
+SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
+{
+ int error;
+ struct path pwd, root;
+ char *page = __getname();
+
+ if (!page)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
+
+ error = -ENOENT;
+ if (!d_unlinked(pwd.dentry)) {
+ unsigned long len;
+ char *cwd = page + PATH_MAX;
+ int buflen = PATH_MAX;
+
+ prepend(&cwd, &buflen, "\0", 1);
+ error = prepend_path(&pwd, &root, &cwd, &buflen);
+ rcu_read_unlock();
+
+ if (error < 0)
+ goto out;
+
+ /* Unreachable from current root */
+ if (error > 0) {
+ error = prepend_unreachable(&cwd, &buflen);
+ if (error)
+ goto out;
+ }
+
+ error = -ERANGE;
+ len = PATH_MAX + page - cwd;
+ if (len <= size) {
+ error = len;
+ if (copy_to_user(buf, cwd, len))
+ error = -EFAULT;
+ }
+ } else {
+ rcu_read_unlock();
+ }
+
+out:
+ __putname(page);
+ return error;
+}
diff --git a/fs/dcache.c b/fs/dcache.c
index 7c38f39958bc..593079176123 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -14,7 +14,7 @@
* the dcache entry is deleted or garbage collected.
*/
-#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -24,18 +24,11 @@
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
-#include <linux/mount.h>
-#include <linux/file.h>
-#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/seqlock.h>
-#include <linux/swap.h>
#include <linux/bootmem.h>
-#include <linux/fs_struct.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
-#include <linux/prefetch.h>
-#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"
@@ -74,9 +67,7 @@
* dentry->d_lock
*
* If no ancestor relationship:
- * if (dentry1 < dentry2)
- * dentry1->d_lock
- * dentry2->d_lock
+ * arbitrary, since it's serialized on rename_lock
*/
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
@@ -440,17 +431,6 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
list_lru_isolate_move(lru, &dentry->d_lru, list);
}
-/*
- * dentry_lru_(add|del)_list) must be called with d_lock held.
- */
-static void dentry_lru_add(struct dentry *dentry)
-{
- if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
- d_lru_add(dentry);
- else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
- dentry->d_flags |= DCACHE_REFERENCED;
-}
-
/**
* d_drop - drop a dentry
* @dentry: dentry to drop
@@ -470,30 +450,29 @@ static void dentry_lru_add(struct dentry *dentry)
*/
static void ___d_drop(struct dentry *dentry)
{
- if (!d_unhashed(dentry)) {
- struct hlist_bl_head *b;
- /*
- * Hashed dentries are normally on the dentry hashtable,
- * with the exception of those newly allocated by
- * d_obtain_root, which are always IS_ROOT:
- */
- if (unlikely(IS_ROOT(dentry)))
- b = &dentry->d_sb->s_roots;
- else
- b = d_hash(dentry->d_name.hash);
+ struct hlist_bl_head *b;
+ /*
+ * Hashed dentries are normally on the dentry hashtable,
+ * with the exception of those newly allocated by
+ * d_obtain_root, which are always IS_ROOT:
+ */
+ if (unlikely(IS_ROOT(dentry)))
+ b = &dentry->d_sb->s_roots;
+ else
+ b = d_hash(dentry->d_name.hash);
- hlist_bl_lock(b);
- __hlist_bl_del(&dentry->d_hash);
- hlist_bl_unlock(b);
- /* After this call, in-progress rcu-walk path lookup will fail. */
- write_seqcount_invalidate(&dentry->d_seq);
- }
+ hlist_bl_lock(b);
+ __hlist_bl_del(&dentry->d_hash);
+ hlist_bl_unlock(b);
}
void __d_drop(struct dentry *dentry)
{
- ___d_drop(dentry);
- dentry->d_hash.pprev = NULL;
+ if (!d_unhashed(dentry)) {
+ ___d_drop(dentry);
+ dentry->d_hash.pprev = NULL;
+ write_seqcount_invalidate(&dentry->d_seq);
+ }
}
EXPORT_SYMBOL(__d_drop);
@@ -589,47 +568,9 @@ static void __dentry_kill(struct dentry *dentry)
dentry_free(dentry);
}
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct inode *inode = dentry->d_inode;
- struct dentry *parent = NULL;
-
- if (inode && unlikely(!spin_trylock(&inode->i_lock)))
- goto failed;
-
- if (!IS_ROOT(dentry)) {
- parent = dentry->d_parent;
- if (unlikely(!spin_trylock(&parent->d_lock))) {
- if (inode)
- spin_unlock(&inode->i_lock);
- goto failed;
- }
- }
-
- __dentry_kill(dentry);
- return parent;
-
-failed:
- spin_unlock(&dentry->d_lock);
- return dentry; /* try again with same dentry */
-}
-
-static inline struct dentry *lock_parent(struct dentry *dentry)
+static struct dentry *__lock_parent(struct dentry *dentry)
{
- struct dentry *parent = dentry->d_parent;
- if (IS_ROOT(dentry))
- return NULL;
- if (unlikely(dentry->d_lockref.count < 0))
- return NULL;
- if (likely(spin_trylock(&parent->d_lock)))
- return parent;
+ struct dentry *parent;
rcu_read_lock();
spin_unlock(&dentry->d_lock);
again:
@@ -655,6 +596,91 @@ again:
return parent;
}
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+ struct dentry *parent = dentry->d_parent;
+ if (IS_ROOT(dentry))
+ return NULL;
+ if (likely(spin_trylock(&parent->d_lock)))
+ return parent;
+ return __lock_parent(dentry);
+}
+
+static inline bool retain_dentry(struct dentry *dentry)
+{
+ WARN_ON(d_in_lookup(dentry));
+
+ /* Unreachable? Get rid of it */
+ if (unlikely(d_unhashed(dentry)))
+ return false;
+
+ if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+ return false;
+
+ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
+ if (dentry->d_op->d_delete(dentry))
+ return false;
+ }
+ /* retain; LRU fodder */
+ dentry->d_lockref.count--;
+ if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+ d_lru_add(dentry);
+ else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ dentry->d_flags |= DCACHE_REFERENCED;
+ return true;
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+ __releases(dentry->d_lock)
+{
+ struct inode *inode = dentry->d_inode;
+ struct dentry *parent = NULL;
+
+ if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+ goto slow_positive;
+
+ if (!IS_ROOT(dentry)) {
+ parent = dentry->d_parent;
+ if (unlikely(!spin_trylock(&parent->d_lock))) {
+ parent = __lock_parent(dentry);
+ if (likely(inode || !dentry->d_inode))
+ goto got_locks;
+ /* negative that became positive */
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ inode = dentry->d_inode;
+ goto slow_positive;
+ }
+ }
+ __dentry_kill(dentry);
+ return parent;
+
+slow_positive:
+ spin_unlock(&dentry->d_lock);
+ spin_lock(&inode->i_lock);
+ spin_lock(&dentry->d_lock);
+ parent = lock_parent(dentry);
+got_locks:
+ if (unlikely(dentry->d_lockref.count != 1)) {
+ dentry->d_lockref.count--;
+ } else if (likely(!retain_dentry(dentry))) {
+ __dentry_kill(dentry);
+ return parent;
+ }
+ /* we are keeping it, after all */
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ spin_unlock(&dentry->d_lock);
+ return NULL;
+}
+
/*
* Try to do a lockless dput(), and return whether that was successful.
*
@@ -802,27 +828,11 @@ repeat:
/* Slow case: now with the dentry lock held */
rcu_read_unlock();
- WARN_ON(d_in_lookup(dentry));
-
- /* Unreachable? Get rid of it */
- if (unlikely(d_unhashed(dentry)))
- goto kill_it;
-
- if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
- goto kill_it;
-
- if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
- if (dentry->d_op->d_delete(dentry))
- goto kill_it;
+ if (likely(retain_dentry(dentry))) {
+ spin_unlock(&dentry->d_lock);
+ return;
}
- dentry_lru_add(dentry);
-
- dentry->d_lockref.count--;
- spin_unlock(&dentry->d_lock);
- return;
-
-kill_it:
dentry = dentry_kill(dentry);
if (dentry) {
cond_resched();
@@ -971,56 +981,83 @@ restart:
}
EXPORT_SYMBOL(d_prune_aliases);
-static void shrink_dentry_list(struct list_head *list)
+/*
+ * Lock a dentry from shrink list.
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry has been disrupted or grabbed, leaving
+ * the caller to kick it off-list. Otherwise, return true and have
+ * that dentry's inode and parent both locked.
+ */
+static bool shrink_lock_dentry(struct dentry *dentry)
{
- struct dentry *dentry, *parent;
+ struct inode *inode;
+ struct dentry *parent;
- while (!list_empty(list)) {
- struct inode *inode;
- dentry = list_entry(list->prev, struct dentry, d_lru);
+ if (dentry->d_lockref.count)
+ return false;
+
+ inode = dentry->d_inode;
+ if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
+ spin_unlock(&dentry->d_lock);
+ spin_lock(&inode->i_lock);
spin_lock(&dentry->d_lock);
- parent = lock_parent(dentry);
+ if (unlikely(dentry->d_lockref.count))
+ goto out;
+ /* changed inode means that somebody had grabbed it */
+ if (unlikely(inode != dentry->d_inode))
+ goto out;
+ }
- /*
- * The dispose list is isolated and dentries are not accounted
- * to the LRU here, so we can simply remove it from the list
- * here regardless of whether it is referenced or not.
- */
- d_shrink_del(dentry);
+ parent = dentry->d_parent;
+ if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
+ return true;
- /*
- * We found an inuse dentry which was not removed from
- * the LRU because of laziness during lookup. Do not free it.
- */
- if (dentry->d_lockref.count > 0) {
- spin_unlock(&dentry->d_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- continue;
- }
+ spin_unlock(&dentry->d_lock);
+ spin_lock(&parent->d_lock);
+ if (unlikely(parent != dentry->d_parent)) {
+ spin_unlock(&parent->d_lock);
+ spin_lock(&dentry->d_lock);
+ goto out;
+ }
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ if (likely(!dentry->d_lockref.count))
+ return true;
+ spin_unlock(&parent->d_lock);
+out:
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ return false;
+}
+static void shrink_dentry_list(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct dentry *dentry, *parent;
- if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
- bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ dentry = list_entry(list->prev, struct dentry, d_lru);
+ spin_lock(&dentry->d_lock);
+ rcu_read_lock();
+ if (!shrink_lock_dentry(dentry)) {
+ bool can_free = false;
+ rcu_read_unlock();
+ d_shrink_del(dentry);
+ if (dentry->d_lockref.count < 0)
+ can_free = dentry->d_flags & DCACHE_MAY_FREE;
spin_unlock(&dentry->d_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
if (can_free)
dentry_free(dentry);
continue;
}
-
- inode = dentry->d_inode;
- if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
- d_shrink_add(dentry, list);
- spin_unlock(&dentry->d_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- continue;
- }
-
+ rcu_read_unlock();
+ d_shrink_del(dentry);
+ parent = dentry->d_parent;
__dentry_kill(dentry);
-
+ if (parent == dentry)
+ continue;
/*
* We need to prune ancestors too. This is necessary to prevent
* quadratic behavior of shrink_dcache_parent(), but is also
@@ -1028,26 +1065,8 @@ static void shrink_dentry_list(struct list_head *list)
* fragmentation.
*/
dentry = parent;
- while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
- parent = lock_parent(dentry);
- if (dentry->d_lockref.count != 1) {
- dentry->d_lockref.count--;
- spin_unlock(&dentry->d_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- break;
- }
- inode = dentry->d_inode; /* can't be NULL */
- if (unlikely(!spin_trylock(&inode->i_lock))) {
- spin_unlock(&dentry->d_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- cpu_relax();
- continue;
- }
- __dentry_kill(dentry);
- dentry = parent;
- }
+ while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
+ dentry = dentry_kill(dentry);
}
}
@@ -2374,32 +2393,22 @@ EXPORT_SYMBOL(d_hash_and_lookup);
void d_delete(struct dentry * dentry)
{
- struct inode *inode;
- int isdir = 0;
+ struct inode *inode = dentry->d_inode;
+ int isdir = d_is_dir(dentry);
+
+ spin_lock(&inode->i_lock);
+ spin_lock(&dentry->d_lock);
/*
* Are we the only user?
*/
-again:
- spin_lock(&dentry->d_lock);
- inode = dentry->d_inode;
- isdir = S_ISDIR(inode->i_mode);
if (dentry->d_lockref.count == 1) {
- if (!spin_trylock(&inode->i_lock)) {
- spin_unlock(&dentry->d_lock);
- cpu_relax();
- goto again;
- }
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
- fsnotify_nameremove(dentry, isdir);
- return;
- }
-
- if (!d_unhashed(dentry))
+ } else {
__d_drop(dentry);
-
- spin_unlock(&dentry->d_lock);
-
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&inode->i_lock);
+ }
fsnotify_nameremove(dentry, isdir);
}
EXPORT_SYMBOL(d_delete);
@@ -2474,7 +2483,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
retry:
rcu_read_lock();
- seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1;
+ seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
r_seq = read_seqbegin(&rename_lock);
dentry = __d_lookup_rcu(parent, name, &d_seq);
if (unlikely(dentry)) {
@@ -2495,8 +2504,14 @@ retry:
rcu_read_unlock();
goto retry;
}
+
+ if (unlikely(seq & 1)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+
hlist_bl_lock(b);
- if (unlikely(parent->d_inode->i_dir_seq != seq)) {
+ if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
hlist_bl_unlock(b);
rcu_read_unlock();
goto retry;
@@ -2758,57 +2773,6 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
kfree_rcu(old_name, u.head);
}
-static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
-{
- /*
- * XXXX: do we really need to take target->d_lock?
- */
- if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
- spin_lock(&target->d_parent->d_lock);
- else {
- if (d_ancestor(dentry->d_parent, target->d_parent)) {
- spin_lock(&dentry->d_parent->d_lock);
- spin_lock_nested(&target->d_parent->d_lock,
- DENTRY_D_LOCK_NESTED);
- } else {
- spin_lock(&target->d_parent->d_lock);
- spin_lock_nested(&dentry->d_parent->d_lock,
- DENTRY_D_LOCK_NESTED);
- }
- }
- if (target < dentry) {
- spin_lock_nested(&target->d_lock, 2);
- spin_lock_nested(&dentry->d_lock, 3);
- } else {
- spin_lock_nested(&dentry->d_lock, 2);
- spin_lock_nested(&target->d_lock, 3);
- }
-}
-
-static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
-{
- if (target->d_parent != dentry->d_parent)
- spin_unlock(&dentry->d_parent->d_lock);
- if (target->d_parent != target)
- spin_unlock(&target->d_parent->d_lock);
- spin_unlock(&target->d_lock);
- spin_unlock(&dentry->d_lock);
-}
-
-/*
- * When switching names, the actual string doesn't strictly have to
- * be preserved in the target - because we're dropping the target
- * anyway. As such, we can just do a simple memcpy() to copy over
- * the new name before we switch, unless we are going to rehash
- * it. Note that if we *do* unhash the target, we are not allowed
- * to rehash it without giving it a new name/hash key - whether
- * we swap or overwrite the names here, resulting name won't match
- * the reality in filesystem; it's only there for d_path() purposes.
- * Note that all of this is happening under rename_lock, so the
- * any hash lookup seeing it in the middle of manipulations will
- * be discarded anyway. So we do not care what happens to the hash
- * key in that case.
- */
/*
* __d_move - move a dentry
* @dentry: entry to move
@@ -2823,15 +2787,34 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
{
+ struct dentry *old_parent, *p;
struct inode *dir = NULL;
unsigned n;
- if (!dentry->d_inode)
- printk(KERN_WARNING "VFS: moving negative dcache entry\n");
- BUG_ON(d_ancestor(dentry, target));
+ WARN_ON(!dentry->d_inode);
+ if (WARN_ON(dentry == target))
+ return;
+
BUG_ON(d_ancestor(target, dentry));
+ old_parent = dentry->d_parent;
+ p = d_ancestor(old_parent, target);
+ if (IS_ROOT(dentry)) {
+ BUG_ON(p);
+ spin_lock(&target->d_parent->d_lock);
+ } else if (!p) {
+ /* target is not a descendant of dentry->d_parent */
+ spin_lock(&target->d_parent->d_lock);
+ spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
+ } else {
+ BUG_ON(p == dentry);
+ spin_lock(&old_parent->d_lock);
+ if (p != target)
+ spin_lock_nested(&target->d_parent->d_lock,
+ DENTRY_D_LOCK_NESTED);
+ }
+ spin_lock_nested(&dentry->d_lock, 2);
+ spin_lock_nested(&target->d_lock, 3);
- dentry_lock_for_move(dentry, target);
if (unlikely(d_in_lookup(target))) {
dir = target->d_parent->d_inode;
n = start_dir_add(dir);
@@ -2842,47 +2825,44 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
/* unhash both */
- /* ___d_drop does write_seqcount_barrier, but they're OK to nest. */
- ___d_drop(dentry);
- ___d_drop(target);
+ if (!d_unhashed(dentry))
+ ___d_drop(dentry);
+ if (!d_unhashed(target))
+ ___d_drop(target);
- /* Switch the names.. */
- if (exchange)
- swap_names(dentry, target);
- else
+ /* ... and switch them in the tree */
+ dentry->d_parent = target->d_parent;
+ if (!exchange) {
copy_name(dentry, target);
-
- /* rehash in new place(s) */
- __d_rehash(dentry);
- if (exchange)
- __d_rehash(target);
- else
target->d_hash.pprev = NULL;
-
- /* ... and switch them in the tree */
- if (IS_ROOT(dentry)) {
- /* splicing a tree */
- dentry->d_flags |= DCACHE_RCUACCESS;
- dentry->d_parent = target->d_parent;
- target->d_parent = target;
- list_del_init(&target->d_child);
- list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+ dentry->d_parent->d_lockref.count++;
+ if (dentry == old_parent)
+ dentry->d_flags |= DCACHE_RCUACCESS;
+ else
+ WARN_ON(!--old_parent->d_lockref.count);
} else {
- /* swapping two dentries */
- swap(dentry->d_parent, target->d_parent);
+ target->d_parent = old_parent;
+ swap_names(dentry, target);
list_move(&target->d_child, &target->d_parent->d_subdirs);
- list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
- if (exchange)
- fsnotify_update_flags(target);
- fsnotify_update_flags(dentry);
+ __d_rehash(target);
+ fsnotify_update_flags(target);
}
+ list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+ __d_rehash(dentry);
+ fsnotify_update_flags(dentry);
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
if (dir)
end_dir_add(dir, n);
- dentry_unlock_for_move(dentry, target);
+
+ if (dentry->d_parent != old_parent)
+ spin_unlock(&dentry->d_parent->d_lock);
+ if (dentry != old_parent)
+ spin_unlock(&old_parent->d_lock);
+ spin_unlock(&target->d_lock);
+ spin_unlock(&dentry->d_lock);
}
/*
@@ -3030,12 +3010,14 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
inode->i_sb->s_type->name,
inode->i_sb->s_id);
} else if (!IS_ROOT(new)) {
+ struct dentry *old_parent = dget(new->d_parent);
int err = __d_unalias(inode, dentry, new);
write_sequnlock(&rename_lock);
if (err) {
dput(new);
new = ERR_PTR(err);
}
+ dput(old_parent);
} else {
__d_move(new, dentry, false);
write_sequnlock(&rename_lock);
@@ -3050,467 +3032,6 @@ out:
}
EXPORT_SYMBOL(d_splice_alias);
-static int prepend(char **buffer, int *buflen, const char *str, int namelen)
-{
- *buflen -= namelen;
- if (*buflen < 0)
- return -ENAMETOOLONG;
- *buffer -= namelen;
- memcpy(*buffer, str, namelen);
- return 0;
-}
-
-/**
- * prepend_name - prepend a pathname in front of current buffer pointer
- * @buffer: buffer pointer
- * @buflen: allocated length of the buffer
- * @name: name string and length qstr structure
- *
- * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
- * make sure that either the old or the new name pointer and length are
- * fetched. However, there may be mismatch between length and pointer.
- * The length cannot be trusted, we need to copy it byte-by-byte until
- * the length is reached or a null byte is found. It also prepends "/" at
- * the beginning of the name. The sequence number check at the caller will
- * retry it again when a d_move() does happen. So any garbage in the buffer
- * due to mismatched pointer and length will be discarded.
- *
- * Load acquire is needed to make sure that we see that terminating NUL.
- */
-static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
-{
- const char *dname = smp_load_acquire(&name->name); /* ^^^ */
- u32 dlen = READ_ONCE(name->len);
- char *p;
-
- *buflen -= dlen + 1;
- if (*buflen < 0)
- return -ENAMETOOLONG;
- p = *buffer -= dlen + 1;
- *p++ = '/';
- while (dlen--) {
- char c = *dname++;
- if (!c)
- break;
- *p++ = c;
- }
- return 0;
-}
-
-/**
- * prepend_path - Prepend path string to a buffer
- * @path: the dentry/vfsmount to report
- * @root: root vfsmnt/dentry
- * @buffer: pointer to the end of the buffer
- * @buflen: pointer to buffer length
- *
- * The function will first try to write out the pathname without taking any
- * lock other than the RCU read lock to make sure that dentries won't go away.
- * It only checks the sequence number of the global rename_lock as any change
- * in the dentry's d_seq will be preceded by changes in the rename_lock
- * sequence number. If the sequence number had been changed, it will restart
- * the whole pathname back-tracing sequence again by taking the rename_lock.
- * In this case, there is no need to take the RCU read lock as the recursive
- * parent pointer references will keep the dentry chain alive as long as no
- * rename operation is performed.
- */
-static int prepend_path(const struct path *path,
- const struct path *root,
- char **buffer, int *buflen)
-{
- struct dentry *dentry;
- struct vfsmount *vfsmnt;
- struct mount *mnt;
- int error = 0;
- unsigned seq, m_seq = 0;
- char *bptr;
- int blen;
-
- rcu_read_lock();
-restart_mnt:
- read_seqbegin_or_lock(&mount_lock, &m_seq);
- seq = 0;
- rcu_read_lock();
-restart:
- bptr = *buffer;
- blen = *buflen;
- error = 0;
- dentry = path->dentry;
- vfsmnt = path->mnt;
- mnt = real_mount(vfsmnt);
- read_seqbegin_or_lock(&rename_lock, &seq);
- while (dentry != root->dentry || vfsmnt != root->mnt) {
- struct dentry * parent;
-
- if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
- struct mount *parent = READ_ONCE(mnt->mnt_parent);
- /* Escaped? */
- if (dentry != vfsmnt->mnt_root) {
- bptr = *buffer;
- blen = *buflen;
- error = 3;
- break;
- }
- /* Global root? */
- if (mnt != parent) {
- dentry = READ_ONCE(mnt->mnt_mountpoint);
- mnt = parent;
- vfsmnt = &mnt->mnt;
- continue;
- }
- if (!error)
- error = is_mounted(vfsmnt) ? 1 : 2;
- break;
- }
- parent = dentry->d_parent;
- prefetch(parent);
- error = prepend_name(&bptr, &blen, &dentry->d_name);
- if (error)
- break;
-
- dentry = parent;
- }
- if (!(seq & 1))
- rcu_read_unlock();
- if (need_seqretry(&rename_lock, seq)) {
- seq = 1;
- goto restart;
- }
- done_seqretry(&rename_lock, seq);
-
- if (!(m_seq & 1))
- rcu_read_unlock();
- if (need_seqretry(&mount_lock, m_seq)) {
- m_seq = 1;
- goto restart_mnt;
- }
- done_seqretry(&mount_lock, m_seq);
-
- if (error >= 0 && bptr == *buffer) {
- if (--blen < 0)
- error = -ENAMETOOLONG;
- else
- *--bptr = '/';
- }
- *buffer = bptr;
- *buflen = blen;
- return error;
-}
-
-/**
- * __d_path - return the path of a dentry
- * @path: the dentry/vfsmount to report
- * @root: root vfsmnt/dentry
- * @buf: buffer to return value in
- * @buflen: buffer length
- *
- * Convert a dentry into an ASCII path name.
- *
- * Returns a pointer into the buffer or an error code if the
- * path was too long.
- *
- * "buflen" should be positive.
- *
- * If the path is not reachable from the supplied root, return %NULL.
- */
-char *__d_path(const struct path *path,
- const struct path *root,
- char *buf, int buflen)
-{
- char *res = buf + buflen;
- int error;
-
- prepend(&res, &buflen, "\0", 1);
- error = prepend_path(path, root, &res, &buflen);
-
- if (error < 0)
- return ERR_PTR(error);
- if (error > 0)
- return NULL;
- return res;
-}
-
-char *d_absolute_path(const struct path *path,
- char *buf, int buflen)
-{
- struct path root = {};
- char *res = buf + buflen;
- int error;
-
- prepend(&res, &buflen, "\0", 1);
- error = prepend_path(path, &root, &res, &buflen);
-
- if (error > 1)
- error = -EINVAL;
- if (error < 0)
- return ERR_PTR(error);
- return res;
-}
-
-/*
- * same as __d_path but appends "(deleted)" for unlinked files.
- */
-static int path_with_deleted(const struct path *path,
- const struct path *root,
- char **buf, int *buflen)
-{
- prepend(buf, buflen, "\0", 1);
- if (d_unlinked(path->dentry)) {
- int error = prepend(buf, buflen, " (deleted)", 10);
- if (error)
- return error;
- }
-
- return prepend_path(path, root, buf, buflen);
-}
-
-static int prepend_unreachable(char **buffer, int *buflen)
-{
- return prepend(buffer, buflen, "(unreachable)", 13);
-}
-
-static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
-{
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&fs->seq);
- *root = fs->root;
- } while (read_seqcount_retry(&fs->seq, seq));
-}
-
-/**
- * d_path - return the path of a dentry
- * @path: path to report
- * @buf: buffer to return value in
- * @buflen: buffer length
- *
- * Convert a dentry into an ASCII path name. If the entry has been deleted
- * the string " (deleted)" is appended. Note that this is ambiguous.
- *
- * Returns a pointer into the buffer or an error code if the path was
- * too long. Note: Callers should use the returned pointer, not the passed
- * in buffer, to use the name! The implementation often starts at an offset
- * into the buffer, and may leave 0 bytes at the start.
- *
- * "buflen" should be positive.
- */
-char *d_path(const struct path *path, char *buf, int buflen)
-{
- char *res = buf + buflen;
- struct path root;
- int error;
-
- /*
- * We have various synthetic filesystems that never get mounted. On
- * these filesystems dentries are never used for lookup purposes, and
- * thus don't need to be hashed. They also don't need a name until a
- * user wants to identify the object in /proc/pid/fd/. The little hack
- * below allows us to generate a name for these objects on demand:
- *
- * Some pseudo inodes are mountable. When they are mounted
- * path->dentry == path->mnt->mnt_root. In that case don't call d_dname
- * and instead have d_path return the mounted path.
- */
- if (path->dentry->d_op && path->dentry->d_op->d_dname &&
- (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
- return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
-
- rcu_read_lock();
- get_fs_root_rcu(current->fs, &root);
- error = path_with_deleted(path, &root, &res, &buflen);
- rcu_read_unlock();
-
- if (error < 0)
- res = ERR_PTR(error);
- return res;
-}
-EXPORT_SYMBOL(d_path);
-
-/*
- * Helper function for dentry_operations.d_dname() members
- */
-char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
- const char *fmt, ...)
-{
- va_list args;
- char temp[64];
- int sz;
-
- va_start(args, fmt);
- sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
- va_end(args);
-
- if (sz > sizeof(temp) || sz > buflen)
- return ERR_PTR(-ENAMETOOLONG);
-
- buffer += buflen - sz;
- return memcpy(buffer, temp, sz);
-}
-
-char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- char *end = buffer + buflen;
- /* these dentries are never renamed, so d_lock is not needed */
- if (prepend(&end, &buflen, " (deleted)", 11) ||
- prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
- prepend(&end, &buflen, "/", 1))
- end = ERR_PTR(-ENAMETOOLONG);
- return end;
-}
-EXPORT_SYMBOL(simple_dname);
-
-/*
- * Write full pathname from the root of the filesystem into the buffer.
- */
-static char *__dentry_path(struct dentry *d, char *buf, int buflen)
-{
- struct dentry *dentry;
- char *end, *retval;
- int len, seq = 0;
- int error = 0;
-
- if (buflen < 2)
- goto Elong;
-
- rcu_read_lock();
-restart:
- dentry = d;
- end = buf + buflen;
- len = buflen;
- prepend(&end, &len, "\0", 1);
- /* Get '/' right */
- retval = end-1;
- *retval = '/';
- read_seqbegin_or_lock(&rename_lock, &seq);
- while (!IS_ROOT(dentry)) {
- struct dentry *parent = dentry->d_parent;
-
- prefetch(parent);
- error = prepend_name(&end, &len, &dentry->d_name);
- if (error)
- break;
-
- retval = end;
- dentry = parent;
- }
- if (!(seq & 1))
- rcu_read_unlock();
- if (need_seqretry(&rename_lock, seq)) {
- seq = 1;
- goto restart;
- }
- done_seqretry(&rename_lock, seq);
- if (error)
- goto Elong;
- return retval;
-Elong:
- return ERR_PTR(-ENAMETOOLONG);
-}
-
-char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
-{
- return __dentry_path(dentry, buf, buflen);
-}
-EXPORT_SYMBOL(dentry_path_raw);
-
-char *dentry_path(struct dentry *dentry, char *buf, int buflen)
-{
- char *p = NULL;
- char *retval;
-
- if (d_unlinked(dentry)) {
- p = buf + buflen;
- if (prepend(&p, &buflen, "//deleted", 10) != 0)
- goto Elong;
- buflen++;
- }
- retval = __dentry_path(dentry, buf, buflen);
- if (!IS_ERR(retval) && p)
- *p = '/'; /* restore '/' overriden with '\0' */
- return retval;
-Elong:
- return ERR_PTR(-ENAMETOOLONG);
-}
-
-static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
- struct path *pwd)
-{
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&fs->seq);
- *root = fs->root;
- *pwd = fs->pwd;
- } while (read_seqcount_retry(&fs->seq, seq));
-}
-
-/*
- * NOTE! The user-level library version returns a
- * character pointer. The kernel system call just
- * returns the length of the buffer filled (which
- * includes the ending '\0' character), or a negative
- * error value. So libc would do something like
- *
- * char *getcwd(char * buf, size_t size)
- * {
- * int retval;
- *
- * retval = sys_getcwd(buf, size);
- * if (retval >= 0)
- * return buf;
- * errno = -retval;
- * return NULL;
- * }
- */
-SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
-{
- int error;
- struct path pwd, root;
- char *page = __getname();
-
- if (!page)
- return -ENOMEM;
-
- rcu_read_lock();
- get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
-
- error = -ENOENT;
- if (!d_unlinked(pwd.dentry)) {
- unsigned long len;
- char *cwd = page + PATH_MAX;
- int buflen = PATH_MAX;
-
- prepend(&cwd, &buflen, "\0", 1);
- error = prepend_path(&pwd, &root, &cwd, &buflen);
- rcu_read_unlock();
-
- if (error < 0)
- goto out;
-
- /* Unreachable from current root */
- if (error > 0) {
- error = prepend_unreachable(&cwd, &buflen);
- if (error)
- goto out;
- }
-
- error = -ERANGE;
- len = PATH_MAX + page - cwd;
- if (len <= size) {
- error = len;
- if (copy_to_user(buf, cwd, len))
- error = -EFAULT;
- }
- } else {
- rcu_read_unlock();
- }
-
-out:
- __putname(page);
- return error;
-}
-
/*
* Test whether new_dentry is a subdirectory of old_dentry.
*
@@ -3574,6 +3095,8 @@ void d_genocide(struct dentry *parent)
d_walk(parent, parent, d_genocide_kill, NULL);
}
+EXPORT_SYMBOL(d_genocide);
+
void d_tmpfile(struct dentry *dentry, struct inode *inode)
{
inode_dec_link_count(inode);
@@ -3653,8 +3176,6 @@ static void __init dcache_init(void)
struct kmem_cache *names_cachep __read_mostly;
EXPORT_SYMBOL(names_cachep);
-EXPORT_SYMBOL(d_genocide);
-
void __init vfs_caches_init_early(void)
{
int i;
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 0d0461cf2431..57bc96435feb 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -146,7 +146,7 @@ out:
/* And here is where the userspace process can look up the cookie value
* to retrieve the path.
*/
-SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
+static int do_lookup_dcookie(u64 cookie64, char __user *buf, size_t len)
{
unsigned long cookie = (unsigned long)cookie64;
int err = -EINVAL;
@@ -203,13 +203,18 @@ out:
return err;
}
+SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
+{
+ return do_lookup_dcookie(cookie64, buf, len);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
{
#ifdef __BIG_ENDIAN
- return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
+ return do_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
#else
- return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
+ return do_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
#endif
}
#endif
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 63a998c3f252..13b01351dd1c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -270,10 +270,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- inode_lock(d_inode(parent));
- dentry = lookup_one_len(name, parent, strlen(name));
- inode_unlock(d_inode(parent));
-
+ dentry = lookup_one_len_unlocked(name, parent, strlen(name));
if (IS_ERR(dentry))
return NULL;
if (!d_really_is_positive(dentry)) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index e31d6ed3ec32..e072e955ce33 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -138,10 +138,6 @@ static int devpts_ptmx_path(struct path *path)
struct super_block *sb;
int err;
- /* Has the devpts filesystem already been found? */
- if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC)
- return 0;
-
/* Is a devpts filesystem at "pts" in the same directory? */
err = path_pts(path);
if (err)
@@ -156,25 +152,53 @@ static int devpts_ptmx_path(struct path *path)
return 0;
}
+/*
+ * Try to find a suitable devpts filesystem. We support the following
+ * scenarios:
+ * - The ptmx device node is located in the same directory as the devpts
+ * mount where the pts device nodes are located.
+ * This is e.g. the case when calling open on the /dev/pts/ptmx device
+ * node when the devpts filesystem is mounted at /dev/pts.
+ * - The ptmx device node is located outside the devpts filesystem mount
+ * where the pts device nodes are located. For example, the ptmx device
+ * is a symlink, separate device node, or bind-mount.
+ * A supported scenario is bind-mounting /dev/pts/ptmx to /dev/ptmx and
+ * then calling open on /dev/ptmx. In this case a suitable pts
+ * subdirectory can be found in the common parent directory /dev of the
+ * devpts mount and the ptmx bind-mount, after resolving the /dev/ptmx
+ * bind-mount.
+ * If no suitable pts subdirectory can be found this function will fail.
+ * This is e.g. the case when bind-mounting /dev/pts/ptmx to /ptmx.
+ */
struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi)
{
struct path path;
- int err;
+ int err = 0;
path = filp->f_path;
path_get(&path);
- err = devpts_ptmx_path(&path);
+ /* Walk upward while the start point is a bind mount of
+ * a single file.
+ */
+ while (path.mnt->mnt_root == path.dentry)
+ if (follow_up(&path) == 0)
+ break;
+
+ /* devpts_ptmx_path() finds a devpts fs or returns an error. */
+ if ((path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (DEVPTS_SB(path.mnt->mnt_sb) != fsi))
+ err = devpts_ptmx_path(&path);
dput(path.dentry);
- if (err) {
- mntput(path.mnt);
- return ERR_PTR(err);
- }
- if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) {
- mntput(path.mnt);
- return ERR_PTR(-ENODEV);
+ if (!err) {
+ if (DEVPTS_SB(path.mnt->mnt_sb) == fsi)
+ return path.mnt;
+
+ err = -ENODEV;
}
- return path.mnt;
+
+ mntput(path.mnt);
+ return ERR_PTR(err);
}
struct pts_fs_info *devpts_acquire(struct file *filp)
@@ -182,15 +206,19 @@ struct pts_fs_info *devpts_acquire(struct file *filp)
struct pts_fs_info *result;
struct path path;
struct super_block *sb;
- int err;
path = filp->f_path;
path_get(&path);
- err = devpts_ptmx_path(&path);
- if (err) {
- result = ERR_PTR(err);
- goto out;
+ /* Has the devpts filesystem already been found? */
+ if (path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) {
+ int err;
+
+ err = devpts_ptmx_path(&path);
+ if (err) {
+ result = ERR_PTR(err);
+ goto out;
+ }
}
/*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a0ca9e48e993..874607bb6e02 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -315,8 +315,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
dio_warn_stale_pagecache(dio->iocb->ki_filp);
}
- if (!(dio->flags & DIO_SKIP_DIO_COUNT))
- inode_dio_end(dio->inode);
+ inode_dio_end(dio->inode);
if (flags & DIO_COMPLETE_ASYNC) {
/*
@@ -1178,9 +1177,9 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
- size_t count = iov_iter_count(iter);
+ const size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
- loff_t end = offset + count;
+ const loff_t end = offset + count;
struct dio *dio;
struct dio_submit sdio = { 0, };
struct buffer_head map_bh = { 0, };
@@ -1201,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
}
/* watch out for a 0 len io from a tricksy fs */
- if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
+ if (iov_iter_rw(iter) == READ && !count)
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1252,8 +1251,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
if (is_sync_kiocb(iocb))
dio->is_async = false;
- else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
- iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
+ else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
dio->is_async = false;
else
dio->is_async = true;
@@ -1274,8 +1272,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
if (dio->is_async && iov_iter_rw(iter) == WRITE) {
retval = 0;
- if ((iocb->ki_filp->f_flags & O_DSYNC) ||
- IS_SYNC(iocb->ki_filp->f_mapping->host))
+ if (iocb->ki_flags & IOCB_DSYNC)
retval = dio_set_defer_completion(dio);
else if (!dio->inode->i_sb->s_dio_done_wq) {
/*
@@ -1298,8 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
/*
* Will be decremented at I/O completion time.
*/
- if (!(dio->flags & DIO_SKIP_DIO_COUNT))
- inode_dio_begin(inode);
+ inode_dio_begin(inode);
retval = 0;
sdio.blkbits = blkbits;
@@ -1319,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->should_dirty = (iter->type == ITER_IOVEC);
sdio.iter = iter;
- sdio.final_block_in_request =
- (offset + iov_iter_count(iter)) >> blkbits;
+ sdio.final_block_in_request = end >> blkbits;
/*
* In case of non-aligned buffers, we may need 2 more
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index cff79ea0c01d..5243989a60cc 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -482,7 +482,6 @@ static void lowcomms_error_report(struct sock *sk)
{
struct connection *con;
struct sockaddr_storage saddr;
- int buflen;
void (*orig_report)(struct sock *) = NULL;
read_lock_bh(&sk->sk_callback_lock);
@@ -492,7 +491,7 @@ static void lowcomms_error_report(struct sock *sk)
orig_report = listen_sock.sk_error_report;
if (con->sock == NULL ||
- kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
+ kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) {
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
"sending to node %d, port %d, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
@@ -757,8 +756,8 @@ static int tcp_accept_from_sock(struct connection *con)
/* Get the connected socket's peer */
memset(&peeraddr, 0, sizeof(peeraddr));
- if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
- &len, 2)) {
+ len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2);
+ if (len < 0) {
result = -ECONNABORTED;
goto accept_err;
}
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 5f22e74bbade..8e568428c88b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -8,6 +8,7 @@
*/
#include <linux/efi.h>
+#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mount.h>
@@ -74,6 +75,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
ssize_t size = 0;
int err;
+ while (!__ratelimit(&file->f_cred->user->ratelimit)) {
+ if (msleep_interruptible(50)) /* non-zero: a signal cut the sleep short */
+ return -EINTR;
+ }
+
err = efivar_entry_size(var, &datasize);
/*
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 012f5bd46dfa..08d3bd602f73 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -380,7 +380,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
-SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+static int do_eventfd(unsigned int count, int flags)
{
struct eventfd_ctx *ctx;
int fd;
@@ -409,8 +409,13 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
return fd;
}
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+{
+ return do_eventfd(count, flags);
+}
+
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
- return sys_eventfd2(count, 0);
+ return do_eventfd(count, 0);
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0f3494ed3ed0..602ca4285b2e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1936,7 +1936,7 @@ static void clear_tfile_check_list(void)
/*
* Open an eventpoll file descriptor.
*/
-SYSCALL_DEFINE1(epoll_create1, int, flags)
+static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
@@ -1979,12 +1979,17 @@ out_free_ep:
return error;
}
+SYSCALL_DEFINE1(epoll_create1, int, flags)
+{
+ return do_epoll_create(flags);
+}
+
SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;
- return sys_epoll_create1(0);
+ return do_epoll_create(0);
}
/*
@@ -2148,8 +2153,8 @@ error_return:
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout)
+static int do_epoll_wait(int epfd, struct epoll_event __user *events,
+ int maxevents, int timeout)
{
int error;
struct fd f;
@@ -2190,6 +2195,12 @@ error_fput:
return error;
}
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout)
+{
+ return do_epoll_wait(epfd, events, maxevents, timeout);
+}
+
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
@@ -2214,7 +2225,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
set_current_blocked(&ksigmask);
}
- error = sys_epoll_wait(epfd, events, maxevents, timeout);
+ error = do_epoll_wait(epfd, events, maxevents, timeout);
/*
* If we changed the signal mask, we need to restore the original one.
@@ -2257,7 +2268,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
set_current_blocked(&ksigmask);
}
- err = sys_epoll_wait(epfd, events, maxevents, timeout);
+ err = do_epoll_wait(epfd, events, maxevents, timeout);
/*
* If we changed the signal mask, we need to restore the original one.
diff --git a/fs/exec.c b/fs/exec.c
index 7eb8d21bcab9..a919a827d181 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -895,13 +895,13 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
return -EINVAL;
- ret = security_kernel_read_file(file, id);
+ ret = deny_write_access(file);
if (ret)
return ret;
- ret = deny_write_access(file);
+ ret = security_kernel_read_file(file, id);
if (ret)
- return ret;
+ goto out;
i_size = i_size_read(file_inode(file));
if (max_size > 0 && i_size > max_size) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7666c065b96f..de1694512f1f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -827,7 +827,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
unsigned long logic_sb_block;
unsigned long offset = 0;
unsigned long def_mount_opts;
- long ret = -EINVAL;
+ long ret = -ENOMEM;
int blocksize = BLOCK_SIZE;
int db_count;
int i, j;
@@ -835,7 +835,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
int err;
struct ext2_mount_options opts;
- err = -ENOMEM;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
goto failed;
@@ -851,6 +850,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_daxdev = dax_dev;
spin_lock_init(&sbi->s_lock);
+ ret = -EINVAL;
/*
* See what the current blocksize for the device is, and
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9b3e0a83526..a33d8fb1bf2a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -243,8 +243,6 @@ static int ext4_init_block_bitmap(struct super_block *sb,
*/
ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
sb->s_blocksize * 8, bh->b_data);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
- ext4_group_desc_csum_set(sb, block_group, gdp);
return 0;
}
@@ -340,20 +338,25 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
/* check whether block bitmap block number is set */
blk = ext4_block_bitmap(sb, desc);
offset = blk - group_first_block;
- if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
+ if (offset < 0 || EXT4_B2C(sbi, offset) >= EXT4_CLUSTERS_PER_GROUP(sb) ||
+ !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode bitmap block number is set */
blk = ext4_inode_bitmap(sb, desc);
offset = blk - group_first_block;
- if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
+ if (offset < 0 || EXT4_B2C(sbi, offset) >= EXT4_CLUSTERS_PER_GROUP(sb) ||
+ !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode table block number is set */
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
+ if (offset < 0 || EXT4_B2C(sbi, offset) >= EXT4_CLUSTERS_PER_GROUP(sb) ||
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= EXT4_CLUSTERS_PER_GROUP(sb))
+ return blk; /* inode table start/end index is outside the group's cluster bitmap */
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
EXT4_B2C(sbi, offset + sbi->s_itb_per_group),
EXT4_B2C(sbi, offset));
@@ -419,6 +422,7 @@ struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
int err;
@@ -427,6 +431,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
if (!desc)
return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_block_bitmap(sb, desc);
+ if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
+ ext4_error(sb, "Invalid block bitmap block %llu in "
+ "block_group %u", bitmap_blk, block_group);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot get buffer for block bitmap - "
@@ -448,6 +458,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
if (err) {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index da87cf757f7d..e2902d394f1b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -365,13 +365,15 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int dx_dir = is_dx_dir(inode);
- loff_t htree_max = ext4_get_htree_eof(file);
+ loff_t ret, htree_max = ext4_get_htree_eof(file);
if (likely(dx_dir))
- return generic_file_llseek_size(file, offset, whence,
+ ret = generic_file_llseek_size(file, offset, whence,
htree_max, htree_max);
else
- return ext4_llseek(file, offset, whence);
+ ret = ext4_llseek(file, offset, whence);
+ file->f_version = inode_peek_iversion(inode) - 1;
+ return ret;
}
/*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3241475a1733..a42e71203e53 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1522,8 +1522,6 @@ enum {
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
- EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
- nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
@@ -3181,21 +3179,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-/*
- * Disable DIO read nolock optimization, so new dioreaders will be forced
- * to grab i_mutex
- */
-static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
-{
- ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
- smp_mb();
-}
-static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
-{
- smp_mb();
- ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
-}
-
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 2d593201cf7a..7c70b08d104c 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -166,13 +166,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
if (ext4_handle_valid(handle)) {
- struct super_block *sb;
-
- sb = handle->h_transaction->t_journal->j_private;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) {
- jbd2_journal_abort_handle(handle);
- return -EIO;
- }
err = jbd2_journal_get_write_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__, bh,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 054416e9d827..0a7315961bac 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4796,7 +4796,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
/* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/* Preallocate the range including the unaligned edges */
@@ -4807,7 +4806,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags);
if (ret)
- goto out_dio;
+ goto out_mutex;
}
@@ -4824,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret) {
up_write(&EXT4_I(inode)->i_mmap_sem);
- goto out_dio;
+ goto out_mutex;
}
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
@@ -4834,10 +4833,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags);
up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
- goto out_dio;
+ goto out_mutex;
}
if (!partial_begin && !partial_end)
- goto out_dio;
+ goto out_mutex;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4850,7 +4849,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_dio;
+ goto out_mutex;
}
inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -4875,8 +4874,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ext4_handle_sync(handle);
ext4_journal_stop(handle);
-out_dio:
- ext4_inode_resume_unlocked_dio(inode);
out_mutex:
inode_unlock(inode);
return ret;
@@ -4964,11 +4961,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
}
/* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- ext4_inode_resume_unlocked_dio(inode);
if (ret)
goto out;
@@ -5485,7 +5480,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
}
/* Wait for existing dio to complete */
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
@@ -5562,7 +5556,6 @@ out_stop:
ext4_journal_stop(handle);
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
- ext4_inode_resume_unlocked_dio(inode);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5635,7 +5628,6 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
}
/* Wait for existing dio to complete */
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
@@ -5737,7 +5729,6 @@ out_stop:
ext4_journal_stop(handle);
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
- ext4_inode_resume_unlocked_dio(inode);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5751,7 +5742,7 @@ out_mutex:
* @lblk1: Start block for first inode
* @lblk2: Start block for second inode
* @count: Number of blocks to swap
- * @mark_unwritten: Mark second inode's extents as unwritten after swap
+ * @unwritten: Mark second inode's extents as unwritten after swap
* @erp: Pointer to save error value
*
* This helper routine does exactly what is promise "swap extents". All other
@@ -5765,7 +5756,7 @@ out_mutex:
*/
int
ext4_swap_extents(handle_t *handle, struct inode *inode1,
- struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+ struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
ext4_lblk_t count, int unwritten, int *erp)
{
struct ext4_ext_path *path1 = NULL;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7830d28df331..df92e3ec9913 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -66,44 +66,6 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}
-/* Initializes an uninitialized inode bitmap */
-static int ext4_init_inode_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
-{
- struct ext4_group_info *grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- J_ASSERT_BH(bh, buffer_locked(bh));
-
- /* If checksum is bad mark all blocks and inodes use to prevent
- * allocation, essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, gdp);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return -EFSBADCRC;
- }
-
- memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
- ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
- bh->b_data);
- ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
- ext4_group_desc_csum_set(sb, block_group, gdp);
-
- return 0;
-}
-
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
@@ -160,6 +122,7 @@ static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
int err;
@@ -169,6 +132,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_inode_bitmap(sb, desc);
+ if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
+ ext4_error(sb, "Invalid inode bitmap blk %llu in "
+ "block_group %u", bitmap_blk, block_group);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot read inode bitmap - "
@@ -187,17 +156,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- err = ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+ ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+ sb->s_blocksize * 8, bh->b_data);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err) {
- ext4_error(sb, "Failed to init inode bitmap for group "
- "%u: %d", block_group, err);
- goto out;
- }
return bh;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c94780075b04..129205028300 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2694,15 +2694,6 @@ out:
return err;
}
-static int __writepage(struct page *page, struct writeback_control *wbc,
- void *data)
-{
- struct address_space *mapping = data;
- int ret = ext4_writepage(page, wbc);
- mapping_set_error(mapping, ret);
- return ret;
-}
-
static int ext4_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -2740,11 +2731,7 @@ static int ext4_writepages(struct address_space *mapping,
goto out_writepages;
if (ext4_should_journal_data(inode)) {
- struct blk_plug plug;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __writepage, mapping);
- blk_finish_plug(&plug);
+ ret = generic_writepages(mapping, wbc);
goto out_writepages;
}
@@ -3524,7 +3511,7 @@ retry:
iomap->flags |= IOMAP_F_DIRTY;
iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev;
- iomap->offset = first_block << blkbits;
+ iomap->offset = (u64)first_block << blkbits;
iomap->length = (u64)map.m_len << blkbits;
if (ret == 0) {
@@ -3669,7 +3656,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
int orphan = 0;
handle_t *handle;
- if (final_size > inode->i_size) {
+ if (final_size > inode->i_size || final_size > ei->i_disksize) {
/* Credits for sb + inode write */
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
@@ -3682,7 +3669,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
goto out;
}
orphan = 1;
- ei->i_disksize = inode->i_size;
+ ext4_update_i_disksize(inode, inode->i_size);
ext4_journal_stop(handle);
}
@@ -3789,9 +3776,10 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
ext4_orphan_del(handle, inode);
if (ret > 0) {
loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
+ if (end > inode->i_size || end > ei->i_disksize) {
+ ext4_update_i_disksize(inode, end);
+ if (end > inode->i_size)
+ i_size_write(inode, end);
/*
* We're going to return a positive `ret'
* here due to non-zero-length I/O, so there's
@@ -4251,7 +4239,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
/* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
@@ -4324,7 +4311,6 @@ out_stop:
ext4_journal_stop(handle);
out_dio:
up_write(&EXT4_I(inode)->i_mmap_sem);
- ext4_inode_resume_unlocked_dio(inode);
out_mutex:
inode_unlock(inode);
return ret;
@@ -4746,6 +4732,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
+ if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
+ EXT4_ERROR_INODE(inode, "root inode unallocated");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5032,12 +5024,12 @@ static int other_inode_match(struct inode * inode, unsigned long ino,
if ((inode->i_ino != ino) ||
(inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ I_DIRTY_INODE)) ||
((inode->i_state & I_DIRTY_TIME) == 0))
return 0;
spin_lock(&inode->i_lock);
if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+ I_DIRTY_INODE)) == 0) &&
(inode->i_state & I_DIRTY_TIME)) {
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -5506,9 +5498,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
*/
if (orphan) {
if (!ext4_should_journal_data(inode)) {
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
- ext4_inode_resume_unlocked_dio(inode);
} else
ext4_wait_for_tail_page_commit(inode);
}
@@ -5999,7 +5989,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return -EROFS;
/* Wait for all existing dio workers */
- ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
@@ -6015,7 +6004,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = filemap_write_and_wait(inode->i_mapping);
if (err < 0) {
up_write(&EXT4_I(inode)->i_mmap_sem);
- ext4_inode_resume_unlocked_dio(inode);
return err;
}
}
@@ -6038,7 +6026,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (err < 0) {
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_journal_flag_rwsem);
- ext4_inode_resume_unlocked_dio(inode);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -6050,7 +6037,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
up_write(&EXT4_I(inode)->i_mmap_sem);
- ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7e99ad02f1ba..a7074115d6f6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -124,8 +124,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
truncate_inode_pages(&inode_bl->i_data, 0);
/* Wait for all existing dio workers */
- ext4_inode_block_unlocked_dio(inode);
- ext4_inode_block_unlocked_dio(inode_bl);
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
@@ -186,8 +184,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
ext4_double_up_write_data_sem(inode, inode_bl);
journal_err_out:
- ext4_inode_resume_unlocked_dio(inode);
- ext4_inode_resume_unlocked_dio(inode_bl);
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
@@ -481,6 +477,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
return 0;
ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
+ trace_ext4_shutdown(sb, flags);
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
@@ -492,15 +489,13 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
(void) ext4_force_commit(sb);
- jbd2_journal_abort(sbi->s_journal, 0);
+ jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN);
}
break;
case EXT4_GOING_FLAGS_NOLOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
- msleep(100);
- jbd2_journal_abort(sbi->s_journal, 0);
- }
+ if (sbi->s_journal && !is_journal_aborted(sbi->s_journal))
+ jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN);
break;
default:
return -EINVAL;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b96e4bd3b3ec..8e17efdcbf11 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -601,8 +601,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
lock_two_nondirectories(orig_inode, donor_inode);
/* Wait for all existing dio workers */
- ext4_inode_block_unlocked_dio(orig_inode);
- ext4_inode_block_unlocked_dio(donor_inode);
inode_dio_wait(orig_inode);
inode_dio_wait(donor_inode);
@@ -693,8 +691,6 @@ out:
ext4_ext_drop_refs(path);
kfree(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
- ext4_inode_resume_unlocked_dio(orig_inode);
- ext4_inode_resume_unlocked_dio(donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 39bf464c35f1..185f7e61f4cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -101,15 +101,13 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* i_data_sem (rw)
*
* truncate:
- * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
- * i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
- * transaction start -> i_data_sem (rw)
+ * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
+ * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * i_data_sem (rw)
*
* direct IO:
- * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem
- * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) ->
- * transaction start -> i_data_sem (rw)
+ * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
*
* writepages:
* transaction start -> page lock(s) -> i_data_sem (rw)
@@ -448,6 +446,7 @@ void __ext4_error(struct super_block *sb, const char *function,
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
+ trace_ext4_error(sb, function, line);
if (ext4_error_ratelimit(sb)) {
va_start(args, fmt);
vaf.fmt = fmt;
@@ -472,6 +471,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return;
+ trace_ext4_error(inode->i_sb, function, line);
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
es->s_last_error_block = cpu_to_le64(block);
if (ext4_error_ratelimit(inode->i_sb)) {
@@ -507,6 +507,7 @@ void __ext4_error_file(struct file *file, const char *function,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return;
+ trace_ext4_error(inode->i_sb, function, line);
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
if (ext4_error_ratelimit(inode->i_sb)) {
@@ -719,6 +720,7 @@ __acquires(bitlock)
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
+ trace_ext4_error(sb, function, line);
es->s_last_error_ino = cpu_to_le32(ino);
es->s_last_error_block = cpu_to_le64(block);
__save_error_info(sb, function, line);
@@ -2019,7 +2021,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+ int def_errors, def_mount_opt = sbi->s_def_mount_opt;
const struct mount_opts *m;
char sep = nodefs ? '\n' : ',';
@@ -2034,7 +2036,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
(m->flags & MOPT_CLEAR_ERR))
continue;
- if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+ if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
if ((want_set &&
(sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
@@ -2068,7 +2070,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
- if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+ if (nodefs || EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ def_mount_opt)) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
SEQ_OPTS_PUTS("data=journal");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -2081,7 +2084,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("inode_readahead_blks=%u",
sbi->s_inode_readahead_blks);
- if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+ if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
(sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
@@ -2333,6 +2336,8 @@ static int ext4_check_descriptors(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Block bitmap for group %u overlaps "
"superblock", i);
+ if (!sb_rdonly(sb))
+ return 0;
}
if (block_bitmap < first_block || block_bitmap > last_block) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2345,6 +2350,8 @@ static int ext4_check_descriptors(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Inode bitmap for group %u overlaps "
"superblock", i);
+ if (!sb_rdonly(sb))
+ return 0;
}
if (inode_bitmap < first_block || inode_bitmap > last_block) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2357,6 +2364,8 @@ static int ext4_check_descriptors(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Inode table for group %u overlaps "
"superblock", i);
+ if (!sb_rdonly(sb))
+ return 0;
}
if (inode_table < first_block ||
inode_table + sbi->s_itb_per_group - 1 > last_block) {
@@ -3490,15 +3499,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Load the checksum driver */
- if (ext4_has_feature_metadata_csum(sb) ||
- ext4_has_feature_ea_inode(sb)) {
- sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(sbi->s_chksum_driver)) {
- ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
- ret = PTR_ERR(sbi->s_chksum_driver);
- sbi->s_chksum_driver = NULL;
- goto failed_mount;
- }
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ ret = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto failed_mount;
}
/* Check superblock checksum */
@@ -3660,6 +3666,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
"using the ext4 subsystem");
else {
+ /*
+ * If we're probing, be silent if this looks like
+ * it's actually an ext[34] filesystem.
+ */
+ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
+ goto failed_mount;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
goto failed_mount;
@@ -3671,6 +3683,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
"using the ext4 subsystem");
else {
+ /*
+ * If we're probing, be silent if this looks like
+ * it's actually an ext4 filesystem.
+ */
+ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
+ goto failed_mount;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
goto failed_mount;
@@ -4094,10 +4112,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* cope, else JOURNAL_DATA
*/
if (jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
set_opt(sb, ORDERED_DATA);
- else
+ sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ } else {
set_opt(sb, JOURNAL_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ }
break;
case EXT4_MOUNT_ORDERED_DATA:
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 1205261f130c..9ebd26c957c2 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -49,8 +49,7 @@ struct ext4_attr {
} u;
};
-static ssize_t session_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
+static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
@@ -61,8 +60,7 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
sbi->s_sectors_written_start) >> 1);
}
-static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
+static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
@@ -74,8 +72,7 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
-static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
+static ssize_t inode_readahead_blks_store(struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
unsigned long t;
@@ -92,8 +89,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
return count;
}
-static ssize_t reserved_clusters_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
+static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
unsigned long long val;
@@ -109,8 +105,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a,
return count;
}
-static ssize_t trigger_test_error(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
+static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
int len = count;
@@ -268,9 +263,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
(s64) EXT4_C2B(sbi,
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
case attr_session_write_kbytes:
- return session_write_kbytes_show(a, sbi, buf);
+ return session_write_kbytes_show(sbi, buf);
case attr_lifetime_write_kbytes:
- return lifetime_write_kbytes_show(a, sbi, buf);
+ return lifetime_write_kbytes_show(sbi, buf);
case attr_reserved_clusters:
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)
@@ -306,7 +301,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
switch (a->attr_id) {
case attr_reserved_clusters:
- return reserved_clusters_store(a, sbi, buf, len);
+ return reserved_clusters_store(sbi, buf, len);
case attr_pointer_ui:
if (!ptr)
return 0;
@@ -316,9 +311,9 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
*((unsigned int *) ptr) = t;
return len;
case attr_inode_readahead:
- return inode_readahead_blks_store(a, sbi, buf, len);
+ return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
- return trigger_test_error(a, sbi, buf, len);
+ return trigger_test_error(sbi, buf, len);
}
return 0;
}
@@ -330,13 +325,6 @@ static void ext4_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
-static void ext4_kset_release(struct kobject *kobj)
-{
- struct kset *kset = container_of(kobj, struct kset, kobj);
-
- kfree(kset);
-}
-
static const struct sysfs_ops ext4_attr_ops = {
.show = ext4_attr_show,
.store = ext4_attr_store,
@@ -348,19 +336,14 @@ static struct kobj_type ext4_sb_ktype = {
.release = ext4_sb_release,
};
-static struct kobj_type ext4_ktype = {
- .sysfs_ops = &ext4_attr_ops,
- .release = ext4_kset_release,
-};
-
-static struct kset *ext4_kset;
-
static struct kobj_type ext4_feat_ktype = {
.default_attrs = ext4_feat_attrs,
.sysfs_ops = &ext4_attr_ops,
.release = (void (*)(struct kobject *))kfree,
};
+static struct kobject *ext4_root;
+
static struct kobject *ext4_feat;
#define PROC_FILE_SHOW_DEFN(name) \
@@ -398,9 +381,8 @@ int ext4_register_sysfs(struct super_block *sb)
const struct ext4_proc_files *p;
int err;
- sbi->s_kobj.kset = ext4_kset;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root,
"%s", sb->s_id);
if (err) {
kobject_put(&sbi->s_kobj);
@@ -436,26 +418,18 @@ int __init ext4_init_sysfs(void)
{
int ret;
- ext4_kset = kzalloc(sizeof(*ext4_kset), GFP_KERNEL);
- if (!ext4_kset)
+ ext4_root = kobject_create_and_add("ext4", fs_kobj);
+ if (!ext4_root)
return -ENOMEM;
- kobject_set_name(&ext4_kset->kobj, "ext4");
- ext4_kset->kobj.parent = fs_kobj;
- ext4_kset->kobj.ktype = &ext4_ktype;
- ret = kset_register(ext4_kset);
- if (ret)
- goto kset_err;
-
ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL);
if (!ext4_feat) {
ret = -ENOMEM;
- goto kset_err;
+ goto root_err;
}
- ext4_feat->kset = ext4_kset;
ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype,
- NULL, "features");
+ ext4_root, "features");
if (ret)
goto feat_err;
@@ -464,17 +438,19 @@ int __init ext4_init_sysfs(void)
feat_err:
kobject_put(ext4_feat);
-kset_err:
- kset_unregister(ext4_kset);
- ext4_kset = NULL;
+ ext4_feat = NULL;
+root_err:
+ kobject_put(ext4_root);
+ ext4_root = NULL;
return ret;
}
void ext4_exit_sysfs(void)
{
kobject_put(ext4_feat);
- kset_unregister(ext4_kset);
- ext4_kset = NULL;
+ ext4_feat = NULL;
+ kobject_put(ext4_root);
+ ext4_root = NULL;
remove_proc_entry(proc_dirname, NULL);
ext4_proc_root = NULL;
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 63656dbafdc4..499cb4b1fbd2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -195,10 +195,13 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
/* Check the values */
while (!IS_LAST_ENTRY(entry)) {
- if (entry->e_value_size != 0 &&
- entry->e_value_inum == 0) {
+ u32 size = le32_to_cpu(entry->e_value_size);
+
+ if (size > EXT4_XATTR_SIZE_MAX)
+ return -EFSCORRUPTED;
+
+ if (size != 0 && entry->e_value_inum == 0) {
u16 offs = le16_to_cpu(entry->e_value_offs);
- u32 size = le32_to_cpu(entry->e_value_size);
void *value;
/*
@@ -222,25 +225,36 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
}
static inline int
-ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
+__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
+ const char *function, unsigned int line)
{
- int error;
+ int error = -EFSCORRUPTED;
if (buffer_verified(bh))
return 0;
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
- return -EFSCORRUPTED;
+ goto errout;
+ error = -EFSBADCRC;
if (!ext4_xattr_block_csum_verify(inode, bh))
- return -EFSBADCRC;
+ goto errout;
error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
bh->b_data);
- if (!error)
+errout:
+ if (error)
+ __ext4_error_inode(inode, function, line, 0,
+ "corrupted xattr block %llu",
+ (unsigned long long) bh->b_blocknr);
+ else
set_buffer_verified(bh);
return error;
}
+#define ext4_xattr_check_block(inode, bh) \
+ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
+
+
static int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
@@ -262,18 +276,22 @@ errout:
__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
static int
-ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
- const char *name, int sorted)
+xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
+ void *end, int name_index, const char *name, int sorted)
{
- struct ext4_xattr_entry *entry;
+ struct ext4_xattr_entry *entry, *next;
size_t name_len;
int cmp = 1;
if (name == NULL)
return -EINVAL;
name_len = strlen(name);
- entry = *pentry;
- for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) {
+ next = EXT4_XATTR_NEXT(entry);
+ if ((void *) next >= end) {
+ EXT4_ERROR_INODE(inode, "corrupted xattr entries");
+ return -EFSCORRUPTED;
+ }
cmp = name_index - entry->e_name_index;
if (!cmp)
cmp = name_len - entry->e_name_len;
@@ -495,6 +513,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
struct buffer_head *bh = NULL;
struct ext4_xattr_entry *entry;
size_t size;
+ void *end;
int error;
struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
@@ -511,20 +530,20 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(inode, bh)) {
- EXT4_ERROR_INODE(inode, "bad block %llu",
- EXT4_I(inode)->i_file_acl);
- error = -EFSCORRUPTED;
+ error = ext4_xattr_check_block(inode, bh);
+ if (error)
goto cleanup;
- }
ext4_xattr_block_cache_insert(ea_block_cache, bh);
entry = BFIRST(bh);
- error = ext4_xattr_find_entry(&entry, name_index, name, 1);
+ end = bh->b_data + bh->b_size;
+ error = xattr_find_entry(inode, &entry, end, name_index, name, 1);
if (error)
goto cleanup;
size = le32_to_cpu(entry->e_value_size);
+ error = -ERANGE;
+ if (unlikely(size > EXT4_XATTR_SIZE_MAX))
+ goto cleanup;
if (buffer) {
- error = -ERANGE;
if (size > buffer_size)
goto cleanup;
if (entry->e_value_inum) {
@@ -533,8 +552,12 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
if (error)
goto cleanup;
} else {
- memcpy(buffer, bh->b_data +
- le16_to_cpu(entry->e_value_offs), size);
+ u16 offset = le16_to_cpu(entry->e_value_offs);
+ void *p = bh->b_data + offset;
+
+ if (unlikely(p + size > end))
+ goto cleanup;
+ memcpy(buffer, p, size);
}
}
error = size;
@@ -568,12 +591,14 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
if (error)
goto cleanup;
entry = IFIRST(header);
- error = ext4_xattr_find_entry(&entry, name_index, name, 0);
+ error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
if (error)
goto cleanup;
size = le32_to_cpu(entry->e_value_size);
+ error = -ERANGE;
+ if (unlikely(size > EXT4_XATTR_SIZE_MAX))
+ goto cleanup;
if (buffer) {
- error = -ERANGE;
if (size > buffer_size)
goto cleanup;
if (entry->e_value_inum) {
@@ -582,8 +607,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
if (error)
goto cleanup;
} else {
- memcpy(buffer, (void *)IFIRST(header) +
- le16_to_cpu(entry->e_value_offs), size);
+ u16 offset = le16_to_cpu(entry->e_value_offs);
+ void *p = (void *)IFIRST(header) + offset;
+
+ if (unlikely(p + size > end))
+ goto cleanup;
+ memcpy(buffer, p, size);
}
}
error = size;
@@ -676,12 +705,9 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(inode, bh)) {
- EXT4_ERROR_INODE(inode, "bad block %llu",
- EXT4_I(inode)->i_file_acl);
- error = -EFSCORRUPTED;
+ error = ext4_xattr_check_block(inode, bh);
+ if (error)
goto cleanup;
- }
ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
@@ -808,10 +834,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
goto out;
}
- if (ext4_xattr_check_block(inode, bh)) {
- ret = -EFSCORRUPTED;
+ ret = ext4_xattr_check_block(inode, bh);
+ if (ret)
goto out;
- }
for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry))
@@ -1793,19 +1818,16 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
- if (ext4_xattr_check_block(inode, bs->bh)) {
- EXT4_ERROR_INODE(inode, "bad block %llu",
- EXT4_I(inode)->i_file_acl);
- error = -EFSCORRUPTED;
+ error = ext4_xattr_check_block(inode, bs->bh);
+ if (error)
goto cleanup;
- }
/* Find the named attribute. */
bs->s.base = BHDR(bs->bh);
bs->s.first = BFIRST(bs->bh);
bs->s.end = bs->bh->b_data + bs->bh->b_size;
bs->s.here = bs->s.first;
- error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
- i->name, 1);
+ error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
+ i->name_index, i->name, 1);
if (error && error != -ENODATA)
goto cleanup;
bs->s.not_found = error;
@@ -2164,8 +2186,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
if (error)
return error;
/* Find the named attribute. */
- error = ext4_xattr_find_entry(&is->s.here, i->name_index,
- i->name, 0);
+ error = xattr_find_entry(inode, &is->s.here, is->s.end,
+ i->name_index, i->name, 0);
if (error && error != -ENODATA)
return error;
is->s.not_found = error;
@@ -2721,13 +2743,9 @@ retry:
error = -EIO;
if (!bh)
goto cleanup;
- if (ext4_xattr_check_block(inode, bh)) {
- EXT4_ERROR_INODE(inode, "bad block %llu",
- EXT4_I(inode)->i_file_acl);
- error = -EFSCORRUPTED;
- brelse(bh);
+ error = ext4_xattr_check_block(inode, bh);
+ if (error)
goto cleanup;
- }
base = BHDR(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
@@ -2884,11 +2902,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
goto cleanup;
}
error = ext4_xattr_check_block(inode, bh);
- if (error) {
- EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
- EXT4_I(inode)->i_file_acl, error);
+ if (error)
goto cleanup;
- }
if (ext4_has_feature_ea_inode(inode->i_sb)) {
for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index dd54c4f995c8..f39cad2abe2a 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -71,6 +71,17 @@ struct ext4_xattr_entry {
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
+ * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking
+ * for file system consistency errors, we use a somewhat bigger value.
+ * This allows XATTR_SIZE_MAX to grow in the future, but by using this
+ * instead of INT_MAX for certain consistency checks, we don't need to
+ * worry about arithmetic overflows. (Actually XATTR_SIZE_MAX is
+ * defined in include/uapi/linux/limits.h, so changing it is
+ * not going to be trivial....)
+ */
+#define EXT4_XATTR_SIZE_MAX (1 << 24)
+
+/*
* The minimum size of EA value when you start storing it in an external inode
* size of block - size of header - size of 1 entry - 4 null bytes
*/
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 512dca8abc7d..bf779461df13 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -68,6 +68,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.old_blkaddr = index,
.new_blkaddr = index,
.encrypted_page = NULL,
+ .is_meta = is_meta,
};
if (unlikely(!is_meta))
@@ -162,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
.encrypted_page = NULL,
.in_list = false,
+ .is_meta = (type != META_POR),
};
struct blk_plug plug;
@@ -569,13 +571,8 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
struct node_info ni;
int err = acquire_orphan_inode(sbi);
- if (err) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_msg(sbi->sb, KERN_WARNING,
- "%s: orphan failed (ino=%x), run fsck to fix.",
- __func__, ino);
- return err;
- }
+ if (err)
+ goto err_out;
__add_ino_entry(sbi, ino, 0, ORPHAN_INO);
@@ -589,6 +586,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
return PTR_ERR(inode);
}
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out;
+
+ dquot_initialize(inode);
clear_nlink(inode);
/* truncate all the data during iput */
@@ -598,14 +600,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
/* ENOMEM was fully retried in f2fs_evict_inode. */
if (ni.blk_addr != NULL_ADDR) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_msg(sbi->sb, KERN_WARNING,
- "%s: orphan failed (ino=%x) by kernel, retry mount.",
- __func__, ino);
- return -EIO;
+ err = -EIO;
+ goto err_out;
}
__remove_ino_entry(sbi, ino, ORPHAN_INO);
return 0;
+
+err_out:
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: orphan failed (ino=%x), run fsck to fix.",
+ __func__, ino);
+ return err;
}
int recover_orphan_inodes(struct f2fs_sb_info *sbi)
@@ -1136,6 +1142,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason & CP_TRIMMED)
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
if (cpc->reason & CP_UMOUNT)
__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
@@ -1162,6 +1170,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
+/*
+ * Copy PAGE_SIZE bytes from @src into the meta page at @blk_addr, write
+ * that page out with FS_CP_META_IO and then submit the merged META_FLUSH
+ * write.
+ *
+ * pagevec_lookup_tag and lock_page again will take some extra time.
+ * Therefore, update_meta_pages and sync_meta_pages are combined in this
+ * function.
+ */
+static void commit_checkpoint(struct f2fs_sb_info *sbi,
+	void *src, block_t blk_addr)
+{
+	struct writeback_control wbc = {
+		.for_reclaim = 0,
+	};
+	struct page *page = grab_meta_page(sbi, blk_addr);
+	int err;
+
+	/*
+	 * Wait for any in-flight writeback of this page before touching its
+	 * contents, so that data already under I/O is never overwritten.
+	 */
+	f2fs_wait_on_page_writeback(page, META, true);
+	f2fs_bug_on(sbi, PageWriteback(page));
+
+	memcpy(page_address(page), src, PAGE_SIZE);
+	set_page_dirty(page);
+
+	if (unlikely(!clear_page_dirty_for_io(page)))
+		f2fs_bug_on(sbi, 1);
+
+	/* writeout cp pack 2 page */
+	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
+	f2fs_bug_on(sbi, err);
+
+	f2fs_put_page(page, 0);
+
+	/* submit checkpoint (with barrier if NOBARRIER is not set) */
+	f2fs_submit_merged_write(sbi, META_FLUSH);
+}
+
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1264,16 +1305,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
}
- /* need to wait for end_io results */
- wait_on_all_pages_writeback(sbi);
- if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
-
- /* flush all device cache */
- err = f2fs_flush_device_cache(sbi);
- if (err)
- return err;
-
/* write out checkpoint buffer at block 0 */
update_meta_page(sbi, ckpt, start_blk++);
@@ -1301,26 +1332,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk += NR_CURSEG_NODE_TYPE;
}
- /* writeout checkpoint block */
- update_meta_page(sbi, ckpt, start_blk);
+ /* update user_block_counts */
+ sbi->last_valid_block_count = sbi->total_valid_block_count;
+ percpu_counter_set(&sbi->alloc_valid_block_count, 0);
+
+ /* Here, we have one bio having CP pack except cp pack 2 page */
+ sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- /* wait for previous submitted node/meta pages writeback */
+ /* wait for previous submitted meta pages writeback */
wait_on_all_pages_writeback(sbi);
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
- filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
-
- /* update user_block_counts */
- sbi->last_valid_block_count = sbi->total_valid_block_count;
- percpu_counter_set(&sbi->alloc_valid_block_count, 0);
-
- /* Here, we only have one bio having CP pack */
- sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
+ /* flush all device cache */
+ err = f2fs_flush_device_cache(sbi);
+ if (err)
+ return err;
- /* wait for previous submitted meta pages writeback */
+ /* barrier and flush checkpoint cp pack 2 page if it can */
+ commit_checkpoint(sbi, ckpt, start_blk);
wait_on_all_pages_writeback(sbi);
release_ino_entry(sbi, false);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7578ed1a85e0..db50686f5096 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -175,15 +175,22 @@ static bool __same_bdev(struct f2fs_sb_info *sbi,
*/
static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
struct writeback_control *wbc,
- int npages, bool is_read)
+ int npages, bool is_read,
+ enum page_type type, enum temp_type temp)
{
struct bio *bio;
bio = f2fs_bio_alloc(sbi, npages, true);
f2fs_target_device(sbi, blk_addr, bio);
- bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
- bio->bi_private = is_read ? NULL : sbi;
+ if (is_read) {
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = NULL;
+ } else {
+ bio->bi_end_io = f2fs_write_end_io;
+ bio->bi_private = sbi;
+ bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp);
+ }
if (wbc)
wbc_init_bio(wbc, bio);
@@ -196,13 +203,12 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
if (!is_read_io(bio_op(bio))) {
unsigned int start;
- if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
- current->plug && (type == DATA || type == NODE))
- blk_finish_plug(current->plug);
-
if (type != DATA && type != NODE)
goto submit_io;
+ if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug)
+ blk_finish_plug(current->plug);
+
start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
start %= F2FS_IO_SIZE(sbi);
@@ -377,12 +383,13 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
struct page *page = fio->encrypted_page ?
fio->encrypted_page : fio->page;
+ verify_block_addr(fio, fio->new_blkaddr);
trace_f2fs_submit_page_bio(page, fio);
f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc,
- 1, is_read_io(fio->op));
+ 1, is_read_io(fio->op), fio->type, fio->temp);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
@@ -422,8 +429,8 @@ next:
}
if (fio->old_blkaddr != NEW_ADDR)
- verify_block_addr(sbi, fio->old_blkaddr);
- verify_block_addr(sbi, fio->new_blkaddr);
+ verify_block_addr(fio, fio->old_blkaddr);
+ verify_block_addr(fio, fio->new_blkaddr);
bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
@@ -445,7 +452,8 @@ alloc_new:
goto out_fail;
}
io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc,
- BIO_MAX_PAGES, false);
+ BIO_MAX_PAGES, false,
+ fio->type, fio->temp);
io->fio = *fio;
}
@@ -832,13 +840,6 @@ alloc:
return 0;
}
-static inline bool __force_buffered_io(struct inode *inode, int rw)
-{
- return (f2fs_encrypted_file(inode) ||
- (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
- F2FS_I_SB(inode)->s_ndevs);
-}
-
int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
@@ -870,7 +871,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
if (direct_io) {
map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint);
- flag = __force_buffered_io(inode, WRITE) ?
+ flag = f2fs_force_buffered_io(inode, WRITE) ?
F2FS_GET_BLOCK_PRE_AIO :
F2FS_GET_BLOCK_PRE_DIO;
goto map_blocks;
@@ -1114,6 +1115,31 @@ out:
return err;
}
+/*
+ * Walk the block range covering [pos, pos + len) with f2fs_map_blocks().
+ * Returns false when the range ends beyond i_size, when a lookup fails, or
+ * when a lookup comes back with a zero-length mapping; true otherwise.
+ */
+bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
+{
+	struct f2fs_map_blocks map;
+	block_t end_blk;
+	int ret;
+
+	if (pos + len > i_size_read(inode))
+		return false;
+
+	map.m_lblk = F2FS_BYTES_TO_BLK(pos);
+	map.m_next_pgofs = NULL;
+	map.m_next_extent = NULL;
+	map.m_seg_type = NO_CHECK_TYPE;
+	end_blk = F2FS_BLK_ALIGN(pos + len);
+
+	/* advance by however many blocks each lookup reported */
+	for (; map.m_lblk < end_blk; map.m_lblk += map.m_len) {
+		map.m_len = end_blk - map.m_lblk;
+		ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
+		if (ret)
+			return false;
+		if (!map.m_len)
+			return false;
+	}
+	return true;
+}
+
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create, int flag,
pgoff_t *next_pgofs, int seg_type)
@@ -2287,25 +2313,41 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
int rw = iov_iter_rw(iter);
int err;
+ enum rw_hint hint = iocb->ki_hint;
+ int whint_mode = F2FS_OPTION(sbi).whint_mode;
err = check_direct_IO(inode, iter, offset);
if (err)
return err;
- if (__force_buffered_io(inode, rw))
+ if (f2fs_force_buffered_io(inode, rw))
return 0;
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
- down_read(&F2FS_I(inode)->dio_rwsem[rw]);
+ if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
+ iocb->ki_hint = WRITE_LIFE_NOT_SET;
+
+ if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ iocb->ki_hint = hint;
+ err = -EAGAIN;
+ goto out;
+ }
+ down_read(&F2FS_I(inode)->dio_rwsem[rw]);
+ }
+
err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
up_read(&F2FS_I(inode)->dio_rwsem[rw]);
if (rw == WRITE) {
+ if (whint_mode == WHINT_MODE_OFF)
+ iocb->ki_hint = hint;
if (err > 0) {
f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
err);
@@ -2315,6 +2357,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
}
+out:
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
return err;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index f00b5ed8c011..fe661274ff10 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -94,14 +94,12 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
- dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
make_dentry_ptr_block(NULL, &d, dentry_blk);
de = find_target_dentry(fname, namehash, max_slots, &d);
if (de)
*res_page = dentry_page;
- else
- kunmap(dentry_page);
return de;
}
@@ -287,7 +285,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
de = f2fs_find_entry(dir, qstr, page);
if (de) {
res = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, *page);
f2fs_put_page(*page, 0);
}
@@ -302,7 +299,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_wait_on_page_writeback(page, type, true);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode->i_mode);
- f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = current_time(dir);
@@ -350,13 +346,11 @@ static int make_empty_dir(struct inode *inode,
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
- dentry_blk = kmap_atomic(dentry_page);
+ dentry_blk = page_address(dentry_page);
make_dentry_ptr_block(NULL, &d, dentry_blk);
do_make_empty_dir(inode, parent, &d);
- kunmap_atomic(dentry_blk);
-
set_page_dirty(dentry_page);
f2fs_put_page(dentry_page, 1);
return 0;
@@ -367,6 +361,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
struct page *dpage)
{
struct page *page;
+ int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir));
int err;
if (is_inode_flag_set(inode, FI_NEW_INODE)) {
@@ -393,7 +388,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
+ if ((f2fs_encrypted_inode(dir) || dummy_encrypt) &&
+ f2fs_may_encrypt(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
@@ -402,8 +398,6 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
return page;
-
- set_cold_node(inode, page);
}
if (new_name) {
@@ -547,13 +541,12 @@ start:
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
- dentry_blk = kmap(dentry_page);
+ dentry_blk = page_address(dentry_page);
bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
slots, NR_DENTRY_IN_BLOCK);
if (bit_pos < NR_DENTRY_IN_BLOCK)
goto add_dentry;
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
@@ -588,7 +581,6 @@ fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
@@ -642,7 +634,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
F2FS_I(dir)->task = NULL;
}
if (de) {
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
err = -EEXIST;
} else if (IS_ERR(page)) {
@@ -713,7 +704,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
- add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO);
+ if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT)
+ add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO);
if (f2fs_has_inline_dentry(dir))
return f2fs_delete_inline_entry(dentry, page, dir, inode);
@@ -730,7 +722,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
0);
- kunmap(page); /* kunmap - pair of f2fs_find_entry */
set_page_dirty(page);
dir->i_ctime = dir->i_mtime = current_time(dir);
@@ -775,7 +766,7 @@ bool f2fs_empty_dir(struct inode *dir)
return false;
}
- dentry_blk = kmap_atomic(dentry_page);
+ dentry_blk = page_address(dentry_page);
if (bidx == 0)
bit_pos = 2;
else
@@ -783,7 +774,6 @@ bool f2fs_empty_dir(struct inode *dir)
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
bit_pos);
- kunmap_atomic(dentry_blk);
f2fs_put_page(dentry_page, 1);
@@ -901,19 +891,17 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
}
}
- dentry_blk = kmap(dentry_page);
+ dentry_blk = page_address(dentry_page);
make_dentry_ptr_block(inode, &d, dentry_blk);
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
if (err) {
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
break;
}
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
out_free:
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ff2352a0ed15..d5a861bf2b42 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -460,7 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode,
struct rb_node *insert_parent)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct rb_node **p = &et->root.rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -706,6 +706,9 @@ void f2fs_drop_extent_tree(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ if (!f2fs_may_extent_tree(inode))
+ return;
+
set_inode_flag(inode, FI_NO_EXTENT);
write_lock(&et->lock);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6300ac5bcbe4..1df7f10476d6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -98,9 +98,10 @@ extern char *fault_name[FAULT_MAX];
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
-#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
+#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
+#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option)
#define ver_after(a, b) (typecheck(unsigned long long, a) && \
typecheck(unsigned long long, b) && \
@@ -113,7 +114,26 @@ typedef u32 block_t; /*
typedef u32 nid_t;
struct f2fs_mount_info {
- unsigned int opt;
+ unsigned int opt;
+ int write_io_size_bits; /* Write IO size bits */
+ block_t root_reserved_blocks; /* root reserved blocks */
+ kuid_t s_resuid; /* reserved blocks for uid */
+ kgid_t s_resgid; /* reserved blocks for gid */
+ int active_logs; /* # of active logs */
+ int inline_xattr_size; /* inline xattr size */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct f2fs_fault_info fault_info; /* For fault injection */
+#endif
+#ifdef CONFIG_QUOTA
+ /* Names of quota files with journalled quota */
+ char *s_qf_names[MAXQUOTAS];
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+ /* For which write hints are passed down to block layer */
+ int whint_mode;
+ int alloc_mode; /* segment allocation policy */
+ int fsync_mode; /* fsync policy */
+ bool test_dummy_encryption; /* test dummy encryption */
};
#define F2FS_FEATURE_ENCRYPT 0x0001
@@ -125,6 +145,8 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040
#define F2FS_FEATURE_QUOTA_INO 0x0080
#define F2FS_FEATURE_INODE_CRTIME 0x0100
+#define F2FS_FEATURE_LOST_FOUND 0x0200
+#define F2FS_FEATURE_VERITY 0x0400 /* reserved */
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -450,7 +472,7 @@ static inline void make_dentry_ptr_block(struct inode *inode,
d->inode = inode;
d->max = NR_DENTRY_IN_BLOCK;
d->nr_bitmap = SIZE_OF_DENTRY_BITMAP;
- d->bitmap = &t->dentry_bitmap;
+ d->bitmap = t->dentry_bitmap;
d->dentry = t->dentry;
d->filename = t->filename;
}
@@ -576,6 +598,8 @@ enum {
#define FADVISE_ENCRYPT_BIT 0x04
#define FADVISE_ENC_NAME_BIT 0x08
#define FADVISE_KEEP_SIZE_BIT 0x10
+#define FADVISE_HOT_BIT 0x20
+#define FADVISE_VERITY_BIT 0x40 /* reserved */
#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
@@ -590,6 +614,9 @@ enum {
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
+#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT)
+#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT)
+#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT)
#define DEF_DIR_LEVEL 0
@@ -637,6 +664,7 @@ struct f2fs_inode_info {
kprojid_t i_projid; /* id for project quota */
int i_inline_xattr_size; /* inline xattr size */
struct timespec i_crtime; /* inode creation time */
+ struct timespec i_disk_time[4]; /* inode disk times */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -743,7 +771,7 @@ struct f2fs_nm_info {
unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */
spinlock_t nid_list_lock; /* protect nid lists ops */
struct mutex build_lock; /* lock for build free nids */
- unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
+ unsigned char **free_nid_bitmap;
unsigned char *nat_block_bitmap;
unsigned short *free_nid_count; /* free nid count of NAT block */
@@ -976,6 +1004,7 @@ struct f2fs_io_info {
bool submitted; /* indicate IO submission */
int need_lock; /* indicate we need to lock cp_rwsem */
bool in_list; /* indicate fio is in io_list */
+ bool is_meta; /* indicate borrow meta inode mapping or not */
enum iostat_type io_type; /* io type */
struct writeback_control *io_wbc; /* writeback control */
};
@@ -1037,10 +1066,34 @@ enum {
MAX_TIME,
};
+enum {
+ WHINT_MODE_OFF, /* not pass down write hints */
+ WHINT_MODE_USER, /* try to pass down hints given by users */
+ WHINT_MODE_FS, /* pass down hints with F2FS policy */
+};
+
+enum {
+ ALLOC_MODE_DEFAULT, /* stay default */
+ ALLOC_MODE_REUSE, /* reuse segments as much as possible */
+};
+
+enum fsync_mode {
+ FSYNC_MODE_POSIX, /* fsync follows posix semantics */
+ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */
+};
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#define DUMMY_ENCRYPTION_ENABLED(sbi) \
+ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
+#else
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
+#endif
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
+ struct rw_semaphore sb_lock; /* lock for raw super block */
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
@@ -1060,7 +1113,6 @@ struct f2fs_sb_info {
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE];
/* bio ordering for NODE/DATA */
- int write_io_size_bits; /* Write IO size bits */
mempool_t *write_io_dummy; /* Dummy pages */
/* for checkpoint */
@@ -1110,9 +1162,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
loff_t max_file_blocks; /* max block index of file */
- int active_logs; /* # of active logs */
int dir_level; /* directory level */
- int inline_xattr_size; /* inline xattr size */
unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */
int readdir_ra; /* readahead inode in readdir */
@@ -1122,9 +1172,6 @@ struct f2fs_sb_info {
block_t last_valid_block_count; /* for recovery */
block_t reserved_blocks; /* configurable reserved blocks */
block_t current_reserved_blocks; /* current reserved blocks */
- block_t root_reserved_blocks; /* root reserved blocks */
- kuid_t s_resuid; /* reserved blocks for uid */
- kgid_t s_resgid; /* reserved blocks for gid */
unsigned int nquota_files; /* # of quota sysfile */
@@ -1209,17 +1256,6 @@ struct f2fs_sb_info {
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_chksum_seed;
-
- /* For fault injection */
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- struct f2fs_fault_info fault_info;
-#endif
-
-#ifdef CONFIG_QUOTA
- /* Names of quota files with journalled quota */
- char *s_qf_names[MAXQUOTAS];
- int s_jquota_fmt; /* Format of quota to use */
-#endif
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
@@ -1229,7 +1265,7 @@ struct f2fs_sb_info {
__func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
- struct f2fs_fault_info *ffi = &sbi->fault_info;
+ struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
if (!ffi->inject_rate)
return false;
@@ -1586,12 +1622,12 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
return false;
if (IS_NOQUOTA(inode))
return true;
- if (capable(CAP_SYS_RESOURCE))
+ if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid()))
return true;
- if (uid_eq(sbi->s_resuid, current_fsuid()))
+ if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) &&
+ in_group_p(F2FS_OPTION(sbi).s_resgid))
return true;
- if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) &&
- in_group_p(sbi->s_resgid))
+ if (capable(CAP_SYS_RESOURCE))
return true;
return false;
}
@@ -1627,7 +1663,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
sbi->current_reserved_blocks;
if (!__allow_reserved_blocks(sbi, inode))
- avail_user_block_count -= sbi->root_reserved_blocks;
+ avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
diff = sbi->total_valid_block_count - avail_user_block_count;
@@ -1762,6 +1798,12 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
int offset;
+ if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) {
+ offset = (flag == SIT_BITMAP) ?
+ le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0;
+ return &ckpt->sit_nat_version_bitmap + offset;
+ }
+
if (__cp_payload(sbi) > 0) {
if (flag == NAT_BITMAP)
return &ckpt->sit_nat_version_bitmap;
@@ -1828,7 +1870,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
sbi->current_reserved_blocks + 1;
if (!__allow_reserved_blocks(sbi, inode))
- valid_block_count += sbi->root_reserved_blocks;
+ valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
if (unlikely(valid_block_count > sbi->user_block_count)) {
spin_unlock(&sbi->stat_lock);
@@ -2399,12 +2441,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DENTRY);
}
-static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
-{
- if (!f2fs_has_inline_dentry(dir))
- kunmap(page);
-}
-
static inline int is_file(struct inode *inode, int type)
{
return F2FS_I(inode)->i_advise & type;
@@ -2436,7 +2472,17 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
}
if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) ||
file_keep_isize(inode) ||
- i_size_read(inode) & PAGE_MASK)
+ i_size_read(inode) & ~PAGE_MASK)
+ return false;
+
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ return false;
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ return false;
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ return false;
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3,
+ &F2FS_I(inode)->i_crtime))
return false;
down_read(&F2FS_I(inode)->i_sem);
@@ -2446,9 +2492,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
return ret;
}
-static inline int f2fs_readonly(struct super_block *sb)
+static inline bool f2fs_readonly(struct super_block *sb)
{
- return sb->s_flags & SB_RDONLY;
+ return sb_rdonly(sb);
}
static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
@@ -2596,6 +2642,8 @@ void handle_failed_inode(struct inode *inode);
/*
* namei.c
*/
+int update_extension_list(struct f2fs_sb_info *sbi, const char *name,
+ bool hot, bool set);
struct dentry *f2fs_get_parent(struct dentry *child);
/*
@@ -2768,6 +2816,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init create_segment_manager_caches(void);
void destroy_segment_manager_caches(void);
int rw_hint_to_seg_type(enum rw_hint hint);
+enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type,
+ enum temp_type temp);
/*
* checkpoint.c
@@ -2850,6 +2900,7 @@ int f2fs_release_page(struct page *page, gfp_t wait);
int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
#endif
+bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
/*
* gc.c
@@ -3172,45 +3223,21 @@ static inline bool f2fs_bio_encrypted(struct bio *bio)
return bio->bi_private != NULL;
}
-static inline int f2fs_sb_has_crypto(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
-}
-
-static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED);
-}
-
-static inline int f2fs_sb_has_extra_attr(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR);
-}
-
-static inline int f2fs_sb_has_project_quota(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA);
-}
-
-static inline int f2fs_sb_has_inode_chksum(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM);
-}
-
-static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR);
-}
-
-static inline int f2fs_sb_has_quota_ino(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO);
+#define F2FS_FEATURE_FUNCS(name, flagname) \
+static inline int f2fs_sb_has_##name(struct super_block *sb) \
+{ \
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \
}
-static inline int f2fs_sb_has_inode_crtime(struct super_block *sb)
-{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME);
-}
+F2FS_FEATURE_FUNCS(encrypt, ENCRYPT);
+F2FS_FEATURE_FUNCS(blkzoned, BLKZONED);
+F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR);
+F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA);
+F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM);
+F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
+F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
+F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
+F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
#ifdef CONFIG_BLK_DEV_ZONED
static inline int get_blkz_type(struct f2fs_sb_info *sbi,
@@ -3230,7 +3257,7 @@ static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
{
struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
- return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb);
+ return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@ -3259,4 +3286,11 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#endif
}
+/*
+ * Returns true for an encrypted file, for a WRITE when the LFS mount option
+ * is set, or when the superblock info has a non-zero s_ndevs.
+ */
+static inline bool f2fs_force_buffered_io(struct inode *inode, int rw)
+{
+	if (f2fs_encrypted_file(inode))
+		return true;
+	if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS))
+		return true;
+	return F2FS_I_SB(inode)->s_ndevs != 0;
+}
+
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 672a542e5464..6b94f19b3fa8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -163,9 +163,10 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
cp_reason = CP_NODE_NEED_CP;
else if (test_opt(sbi, FASTBOOT))
cp_reason = CP_FASTBOOT_MODE;
- else if (sbi->active_logs == 2)
+ else if (F2FS_OPTION(sbi).active_logs == 2)
cp_reason = CP_SPEC_LOG_NUM;
- else if (need_dentry_mark(sbi, inode->i_ino) &&
+ else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT &&
+ need_dentry_mark(sbi, inode->i_ino) &&
exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO))
cp_reason = CP_RECOVER_DIR;
@@ -479,6 +480,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
+
+ filp->f_mode |= FMODE_NOWAIT;
+
return dquot_file_open(inode, filp);
}
@@ -569,7 +573,6 @@ truncate_out:
int truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
int count = 0, err = 0;
@@ -578,7 +581,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
trace_f2fs_truncate_blocks_enter(inode, from);
- free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
+ free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
if (free_from >= sbi->max_file_blocks)
goto free_partial;
@@ -1348,8 +1351,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
}
out:
- if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
- f2fs_i_size_write(inode, new_size);
+ if (new_size > i_size_read(inode)) {
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ file_set_keep_isize(inode);
+ else
+ f2fs_i_size_write(inode, new_size);
+ }
out_sem:
up_write(&F2FS_I(inode)->i_mmap_sem);
@@ -1711,6 +1718,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
inode_lock(inode);
+ down_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
+
if (f2fs_is_volatile_file(inode))
goto err_out;
@@ -1729,6 +1738,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
err_out:
+ up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -1938,7 +1948,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- if (!f2fs_sb_has_crypto(inode->i_sb))
+ if (!f2fs_sb_has_encrypt(inode->i_sb))
return -EOPNOTSUPP;
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -1948,7 +1958,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
- if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb))
+ if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
}
@@ -1959,16 +1969,18 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
- if (!f2fs_sb_has_crypto(inode->i_sb))
+ if (!f2fs_sb_has_encrypt(inode->i_sb))
return -EOPNOTSUPP;
- if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
- goto got_it;
-
err = mnt_want_write_file(filp);
if (err)
return err;
+ down_write(&sbi->sb_lock);
+
+ if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
+ goto got_it;
+
/* update superblock with uuid */
generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
@@ -1976,15 +1988,16 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
if (err) {
/* undo new data */
memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
- mnt_drop_write_file(filp);
- return err;
+ goto out_err;
}
- mnt_drop_write_file(filp);
got_it:
if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
16))
- return -EFAULT;
- return 0;
+ err = -EFAULT;
+out_err:
+ up_write(&sbi->sb_lock);
+ mnt_drop_write_file(filp);
+ return err;
}
static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
@@ -2045,8 +2058,10 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
return ret;
end = range.start + range.len;
- if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi))
- return -EINVAL;
+ if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) {
+ ret = -EINVAL;
+ goto out;
+ }
do_more:
if (!range.sync) {
if (!mutex_trylock(&sbi->gc_mutex)) {
@@ -2885,25 +2900,54 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
- inode_lock(inode);
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
ret = generic_write_checks(iocb, from);
if (ret > 0) {
+ bool preallocated = false;
+ size_t target_size = 0;
int err;
if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
set_inode_flag(inode, FI_NO_PREALLOC);
- err = f2fs_preallocate_blocks(iocb, from);
- if (err) {
- clear_inode_flag(inode, FI_NO_PREALLOC);
- inode_unlock(inode);
- return err;
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ (iocb->ki_flags & IOCB_DIRECT)) {
+ if (!f2fs_overwrite_io(inode, iocb->ki_pos,
+ iov_iter_count(from)) ||
+ f2fs_has_inline_data(inode) ||
+ f2fs_force_buffered_io(inode, WRITE)) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+
+ } else {
+ preallocated = true;
+ target_size = iocb->ki_pos + iov_iter_count(from);
+
+ err = f2fs_preallocate_blocks(iocb, from);
+ if (err) {
+ clear_inode_flag(inode, FI_NO_PREALLOC);
+ inode_unlock(inode);
+ return err;
+ }
}
blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
blk_finish_plug(&plug);
clear_inode_flag(inode, FI_NO_PREALLOC);
+ /* if we couldn't write data, we should deallocate blocks. */
+ if (preallocated && i_size_read(inode) < target_size)
+ f2fs_truncate(inode);
+
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index aa720cc44509..bfb7a4a3a929 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -76,14 +76,15 @@ static int gc_thread_func(void *data)
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
- if (!mutex_trylock(&sbi->gc_mutex))
- goto next;
-
if (gc_th->gc_urgent) {
wait_ms = gc_th->urgent_sleep_time;
+ mutex_lock(&sbi->gc_mutex);
goto do_gc;
}
+ if (!mutex_trylock(&sbi->gc_mutex))
+ goto next;
+
if (!is_idle(sbi)) {
increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
@@ -161,12 +162,17 @@ static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
{
int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
- if (gc_th && gc_th->gc_idle) {
+ if (!gc_th)
+ return gc_mode;
+
+ if (gc_th->gc_idle) {
if (gc_th->gc_idle == 1)
gc_mode = GC_CB;
else if (gc_th->gc_idle == 2)
gc_mode = GC_GREEDY;
}
+ if (gc_th->gc_urgent)
+ gc_mode = GC_GREEDY;
return gc_mode;
}
@@ -188,11 +194,14 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
}
/* we need to check every dirty segments in the FG_GC case */
- if (gc_type != FG_GC && p->max_search > sbi->max_victim_search)
+ if (gc_type != FG_GC &&
+ (sbi->gc_thread && !sbi->gc_thread->gc_urgent) &&
+ p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- /* let's select beginning hot/small space first */
- if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ /* let's select beginning hot/small space first in no_heap mode */
+ if (test_opt(sbi, NOHEAP) &&
+ (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
else
p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 90e38d8ea688..3b77d6421218 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -369,7 +369,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
f2fs_wait_on_page_writeback(page, DATA, true);
zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE);
- dentry_blk = kmap_atomic(page);
+ dentry_blk = page_address(page);
make_dentry_ptr_inline(dir, &src, inline_dentry);
make_dentry_ptr_block(dir, &dst, dentry_blk);
@@ -386,7 +386,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max);
memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN);
- kunmap_atomic(dentry_blk);
if (!PageUptodate(page))
SetPageUptodate(page);
set_page_dirty(page);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 205add3d0f3a..e0d9e8f27ed2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -284,6 +284,10 @@ static int do_read_inode(struct inode *inode)
fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
}
+ F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
+ F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
+ F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
+ F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -328,7 +332,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+ inode_nohighmem(inode);
} else if (S_ISLNK(inode->i_mode)) {
if (f2fs_encrypted_inode(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
@@ -439,12 +443,15 @@ void update_inode(struct inode *inode, struct page *node_page)
}
__set_inode_rdev(inode, ri);
- set_cold_node(inode, node_page);
/* deleted inode */
if (inode->i_nlink == 0)
clear_inline_node(node_page);
+ F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
+ F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
+ F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
+ F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
}
void update_inode_page(struct inode *inode)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b68e7b03959f..d5098efe577c 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -78,7 +78,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
/* If the directory encrypted, then we should encrypt the inode. */
- if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
+ if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi->sb)) {
@@ -97,7 +98,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
if (f2fs_has_inline_xattr(inode))
- xattr_size = sbi->inline_xattr_size;
+ xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
/* Otherwise, will be 0 */
} else if (f2fs_has_inline_xattr(inode) ||
f2fs_has_inline_dentry(inode)) {
@@ -142,7 +143,7 @@ fail_drop:
return ERR_PTR(err);
}
-static int is_multimedia_file(const unsigned char *s, const char *sub)
+static int is_extension_exist(const unsigned char *s, const char *sub)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
@@ -168,19 +169,94 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
/*
* Set multimedia files as cold files for hot/cold data separation
*/
-static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
+static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
{
- int i;
- __u8 (*extlist)[8] = sbi->raw_super->extension_list;
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ int i, cold_count, hot_count;
+
+ down_read(&sbi->sb_lock);
+
+ cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ hot_count = sbi->raw_super->hot_ext_count;
- int count = le32_to_cpu(sbi->raw_super->extension_count);
- for (i = 0; i < count; i++) {
- if (is_multimedia_file(name, extlist[i])) {
+ for (i = 0; i < cold_count + hot_count; i++) {
+ if (!is_extension_exist(name, extlist[i]))
+ continue;
+ if (i < cold_count)
file_set_cold(inode);
- break;
- }
+ else
+ file_set_hot(inode);
+ break;
}
+
+ up_read(&sbi->sb_lock);
+}
+
+/*
+ * Add or remove one entry of the superblock's extension list.
+ *
+ * The list stores the cold extensions first (extension_count entries),
+ * immediately followed by the hot ones (hot_ext_count entries).
+ *
+ * @sbi:  filesystem whose raw_super->extension_list is edited in place
+ * @name: NUL-terminated extension; must be shorter than F2FS_EXTENSION_LEN
+ * @hot:  operate on the hot part of the list, otherwise on the cold part
+ * @set:  true to add @name, false to remove it
+ *
+ * Returns 0 on success, or -EINVAL when @name does not fit in a slot, the
+ * list is full or @name is already present (add), or @name is not present
+ * in the selected part (remove).
+ *
+ * NOTE(review): nothing in here takes sbi->sb_lock; presumably the caller
+ * holds it for writing -- confirm.
+ */
+int update_extension_list(struct f2fs_sb_info *sbi, const char *name,
+							bool hot, bool set)
+{
+	__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+	int cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+	int hot_count = sbi->raw_super->hot_ext_count;
+	int total_count = cold_count + hot_count;
+	size_t namelen = strlen(name);
+	int start, count;
+	int i;
+
+	/* every slot has to keep room for its terminating NUL */
+	if (namelen >= F2FS_EXTENSION_LEN)
+		return -EINVAL;
+
+	if (set) {
+		if (total_count >= F2FS_MAX_EXTENSION)
+			return -EINVAL;
+	} else {
+		if (!hot && !cold_count)
+			return -EINVAL;
+		if (hot && !hot_count)
+			return -EINVAL;
+	}
+
+	if (hot) {
+		start = cold_count;
+		count = total_count;
+	} else {
+		start = 0;
+		count = cold_count;
+	}
+
+	for (i = start; i < count; i++) {
+		if (strcmp(name, extlist[i]))
+			continue;
+
+		/* already listed: nothing to add */
+		if (set)
+			return -EINVAL;
+
+		/*
+		 * Close the gap. Source and destination overlap, so this
+		 * has to be memmove() rather than memcpy().
+		 */
+		memmove(extlist[i], extlist[i + 1],
+				F2FS_EXTENSION_LEN * (total_count - i - 1));
+		memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN);
+		if (hot)
+			sbi->raw_super->hot_ext_count = hot_count - 1;
+		else
+			sbi->raw_super->extension_count =
+						cpu_to_le32(cold_count - 1);
+		return 0;
+	}
+
+	/* asked to remove an extension that is not in the list */
+	if (!set)
+		return -EINVAL;
+
+	/*
+	 * 'count' now indexes the insertion slot: the end of the whole list
+	 * for a hot entry, the end of the cold part for a cold one. In the
+	 * latter case the hot entries are shifted up by one slot first.
+	 */
+	if (!hot)
+		memmove(extlist[count + 1], extlist[count],
+				F2FS_EXTENSION_LEN * hot_count);
+
+	/* zero the slot first so the stored name is always NUL-terminated */
+	memset(extlist[count], 0, F2FS_EXTENSION_LEN);
+	memcpy(extlist[count], name, namelen);
+
+	if (hot)
+		sbi->raw_super->hot_ext_count = hot_count + 1;
+	else
+		sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1);
+	return 0;
+}
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
@@ -203,7 +279,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
return PTR_ERR(inode);
if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
- set_cold_files(sbi, inode, dentry->d_name.name);
+ set_file_temperature(sbi, inode, dentry->d_name.name);
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
@@ -317,7 +393,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
de = f2fs_find_entry(dir, &dot, &page);
if (de) {
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
} else if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -329,14 +404,12 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
}
de = f2fs_find_entry(dir, &dotdot, &page);
- if (de) {
- f2fs_dentry_kunmap(dir, page);
+ if (de)
f2fs_put_page(page, 0);
- } else if (IS_ERR(page)) {
+ else if (IS_ERR(page))
err = PTR_ERR(page);
- } else {
+ else
err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
- }
out:
if (!err)
clear_inode_flag(dir, FI_INLINE_DOTS);
@@ -377,7 +450,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
}
ino = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
inode = f2fs_iget(dir->i_sb, ino);
@@ -452,7 +524,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
err = acquire_orphan_inode(sbi);
if (err) {
f2fs_unlock_op(sbi);
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
goto fail;
}
@@ -579,7 +650,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+ inode_nohighmem(inode);
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
@@ -717,10 +788,12 @@ out:
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+ if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- if (f2fs_encrypted_inode(dir)) {
+ if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
@@ -893,16 +966,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (old_dir_entry) {
- if (old_dir != new_dir && !whiteout) {
+ if (old_dir != new_dir && !whiteout)
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
- } else {
- f2fs_dentry_kunmap(old_inode, old_dir_page);
+ else
f2fs_put_page(old_dir_page, 0);
- }
f2fs_i_links_write(old_dir, false);
}
- add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
+ add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
f2fs_unlock_op(sbi);
@@ -912,20 +984,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
put_out_dir:
f2fs_unlock_op(sbi);
- if (new_page) {
- f2fs_dentry_kunmap(new_dir, new_page);
+ if (new_page)
f2fs_put_page(new_page, 0);
- }
out_whiteout:
if (whiteout)
iput(whiteout);
out_dir:
- if (old_dir_entry) {
- f2fs_dentry_kunmap(old_inode, old_dir_page);
+ if (old_dir_entry)
f2fs_put_page(old_dir_page, 0);
- }
out_old:
- f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
@@ -1057,8 +1124,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
}
f2fs_mark_inode_dirty_sync(new_dir, false);
- add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO);
- add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
+ add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO);
+ add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
+ }
f2fs_unlock_op(sbi);
@@ -1067,19 +1136,15 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return 0;
out_new_dir:
if (new_dir_entry) {
- f2fs_dentry_kunmap(new_inode, new_dir_page);
f2fs_put_page(new_dir_page, 0);
}
out_old_dir:
if (old_dir_entry) {
- f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
out_new:
- f2fs_dentry_kunmap(new_dir, new_page);
f2fs_put_page(new_page, 0);
out_old:
- f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 177c438e4a56..9a99243054ba 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -193,8 +193,8 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
__free_nat_entry(e);
}
-static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
{
nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
struct nat_entry_set *head;
@@ -209,15 +209,36 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
head->entry_cnt = 0;
f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
}
+ return head;
+}
+
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ struct nat_entry_set *head;
+ bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
+
+ if (!new_ne)
+ head = __grab_nat_entry_set(nm_i, ne);
+
+ /*
+ * update entry_cnt in below condition:
+ * 1. update NEW_ADDR to valid block address;
+ * 2. update old block address to new one;
+ */
+ if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
+ !get_nat_flag(ne, IS_DIRTY)))
+ head->entry_cnt++;
+
+ set_nat_flag(ne, IS_PREALLOC, new_ne);
if (get_nat_flag(ne, IS_DIRTY))
goto refresh_list;
nm_i->dirty_nat_cnt++;
- head->entry_cnt++;
set_nat_flag(ne, IS_DIRTY, true);
refresh_list:
- if (nat_get_blkaddr(ne) == NEW_ADDR)
+ if (new_ne)
list_del_init(&ne->list);
else
list_move_tail(&ne->list, &head->entry_list);
@@ -1076,7 +1097,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
f2fs_wait_on_page_writeback(page, NODE, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
- set_cold_node(dn->inode, page);
+ set_cold_node(page, S_ISDIR(dn->inode->i_mode));
if (!PageUptodate(page))
SetPageUptodate(page);
if (set_page_dirty(page))
@@ -2291,6 +2312,7 @@ retry:
if (!PageUptodate(ipage))
SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
+ set_cold_node(page, false);
src = F2FS_INODE(page);
dst = F2FS_INODE(ipage);
@@ -2580,8 +2602,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
if (!enabled_nat_bits(sbi, NULL))
return 0;
- nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
- F2FS_BLKSIZE - 1);
+ nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kzalloc(sbi,
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
if (!nm_i->nat_bits)
@@ -2707,12 +2728,20 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
static int init_free_nid_cache(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
+ int i;
- nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks *
- NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
+ nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks *
+ sizeof(unsigned char *), GFP_KERNEL);
if (!nm_i->free_nid_bitmap)
return -ENOMEM;
+	/* each NAT block gets its own separately allocated bitmap */
+	for (i = 0; i < nm_i->nat_blocks; i++) {
+		nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi,
+				NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL);
+		/* check the slot just allocated, not the pointer array */
+		if (!nm_i->free_nid_bitmap[i])
+			return -ENOMEM;
+	}
+
nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8,
GFP_KERNEL);
if (!nm_i->nat_block_bitmap)
@@ -2803,7 +2832,13 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
up_write(&nm_i->nat_tree_lock);
kvfree(nm_i->nat_block_bitmap);
- kvfree(nm_i->free_nid_bitmap);
+ if (nm_i->free_nid_bitmap) {
+ int i;
+
+ for (i = 0; i < nm_i->nat_blocks; i++)
+ kvfree(nm_i->free_nid_bitmap[i]);
+ kfree(nm_i->free_nid_bitmap);
+ }
kvfree(nm_i->free_nid_count);
kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 081ef0d672bf..b95e49e4a928 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -44,6 +44,7 @@ enum {
HAS_FSYNCED_INODE, /* is the inode fsynced before? */
HAS_LAST_FSYNC, /* has the latest node fsync mark? */
IS_DIRTY, /* this nat entry is dirty? */
+ IS_PREALLOC, /* nat entry is preallocated */
};
/*
@@ -422,12 +423,12 @@ static inline void clear_inline_node(struct page *page)
ClearPageChecked(page);
}
-static inline void set_cold_node(struct inode *inode, struct page *page)
+static inline void set_cold_node(struct page *page, bool is_dir)
{
struct f2fs_node *rn = F2FS_NODE(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
- if (S_ISDIR(inode->i_mode))
+ if (is_dir)
flag &= ~(0x1 << COLD_BIT_SHIFT);
else
flag |= (0x1 << COLD_BIT_SHIFT);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 337f3363f48f..1b23d3febe4c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -144,7 +144,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
retry:
de = __f2fs_find_entry(dir, &fname, &page);
if (de && inode->i_ino == le32_to_cpu(de->ino))
- goto out_unmap_put;
+ goto out_put;
if (de) {
einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino));
@@ -153,19 +153,19 @@ retry:
err = PTR_ERR(einode);
if (err == -ENOENT)
err = -EEXIST;
- goto out_unmap_put;
+ goto out_put;
}
err = dquot_initialize(einode);
if (err) {
iput(einode);
- goto out_unmap_put;
+ goto out_put;
}
err = acquire_orphan_inode(F2FS_I_SB(inode));
if (err) {
iput(einode);
- goto out_unmap_put;
+ goto out_put;
}
f2fs_delete_entry(de, page, dir, einode);
iput(einode);
@@ -180,8 +180,7 @@ retry:
goto retry;
goto out;
-out_unmap_put:
- f2fs_dentry_kunmap(dir, page);
+out_put:
f2fs_put_page(page, 0);
out:
if (file_enc_name(inode))
@@ -243,6 +242,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
struct curseg_info *curseg;
struct page *page = NULL;
block_t blkaddr;
+ unsigned int loop_cnt = 0;
+ unsigned int free_blocks = sbi->user_block_count -
+ valid_user_blocks(sbi);
int err = 0;
/* get node pages in the current segment */
@@ -295,6 +297,17 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
if (IS_INODE(page) && is_dent_dnode(page))
entry->last_dentry = blkaddr;
next:
+ /* sanity check in order to detect looped node chain */
+ if (++loop_cnt >= free_blocks ||
+ blkaddr == next_blkaddr_of_node(page)) {
+ f2fs_msg(sbi->sb, KERN_NOTICE,
+ "%s: detect looped node chain, "
+ "blkaddr:%u, next:%u",
+ __func__, blkaddr, next_blkaddr_of_node(page));
+ err = -EINVAL;
+ break;
+ }
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index b16a8e6625aa..5854cc4e1d67 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1411,12 +1411,11 @@ static int issue_discard_thread(void *data)
if (kthread_should_stop())
return 0;
- if (dcc->discard_wake) {
+ if (dcc->discard_wake)
dcc->discard_wake = 0;
- if (sbi->gc_thread && sbi->gc_thread->gc_urgent)
- init_discard_policy(&dpolicy,
- DPOLICY_FORCE, 1);
- }
+
+ if (sbi->gc_thread && sbi->gc_thread->gc_urgent)
+ init_discard_policy(&dpolicy, DPOLICY_FORCE, 1);
sb_start_intwrite(sbi->sb);
@@ -1485,7 +1484,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
#ifdef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
+ if (f2fs_sb_has_blkzoned(sbi->sb) &&
bdev_zoned_model(bdev) != BLK_ZONED_NONE)
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
@@ -1683,7 +1682,7 @@ find_next:
sbi->blocks_per_seg, cur_pos);
len = next_pos - cur_pos;
- if (f2fs_sb_mounted_blkzoned(sbi->sb) ||
+ if (f2fs_sb_has_blkzoned(sbi->sb) ||
(force && len < cpc->trim_minlen))
goto skip;
@@ -1727,7 +1726,7 @@ void init_discard_policy(struct discard_policy *dpolicy,
} else if (discard_type == DPOLICY_FORCE) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
- dpolicy->io_aware = true;
+ dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_FSTRIM) {
dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_UMOUNT) {
@@ -1863,7 +1862,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
sbi->discard_blks--;
/* don't overwrite by SSR to keep node chain */
- if (se->type == CURSEG_WARM_NODE) {
+ if (IS_NODESEG(se->type)) {
if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
se->ckpt_valid_blocks++;
}
@@ -2164,11 +2163,17 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
if (sbi->segs_per_sec != 1)
return CURSEG_I(sbi, type)->segno;
- if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ if (test_opt(sbi, NOHEAP) &&
+ (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
return 0;
if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
return SIT_I(sbi)->last_victim[ALLOC_NEXT];
+
+ /* find segments from 0 to reuse freed segments */
+ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
+ return 0;
+
return CURSEG_I(sbi, type)->segno;
}
@@ -2455,6 +2460,101 @@ int rw_hint_to_seg_type(enum rw_hint hint)
}
}
+/* This returns write hints for each segment type. These hints will be
+ * passed down to the block layer. The mapping tables below depend on
+ * the mount option 'whint_mode'.
+ *
+ * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
+ *
+ * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
+ *
+ * User F2FS Block
+ * ---- ---- -----
+ * META WRITE_LIFE_NOT_SET
+ * HOT_NODE "
+ * WARM_NODE "
+ * COLD_NODE "
+ * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
+ * extension list " "
+ *
+ * -- buffered io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " "
+ * WRITE_LIFE_MEDIUM " "
+ * WRITE_LIFE_LONG " "
+ *
+ * -- direct io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " WRITE_LIFE_NONE
+ * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
+ * WRITE_LIFE_LONG " WRITE_LIFE_LONG
+ *
+ * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
+ *
+ * User F2FS Block
+ * ---- ---- -----
+ * META WRITE_LIFE_MEDIUM
+ * HOT_NODE WRITE_LIFE_NOT_SET
+ * WARM_NODE "
+ * COLD_NODE WRITE_LIFE_NONE
+ * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
+ * extension list " "
+ *
+ * -- buffered io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
+ * WRITE_LIFE_NONE " "
+ * WRITE_LIFE_MEDIUM " "
+ * WRITE_LIFE_LONG " "
+ *
+ * -- direct io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " WRITE_LIFE_NONE
+ * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
+ * WRITE_LIFE_LONG " WRITE_LIFE_LONG
+ */
+
+enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+				enum page_type type, enum temp_type temp)
+{
+	const bool user_based =
+		F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER;
+	const bool fs_based =
+		F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS;
+
+	/* any other whint_mode never passes a specific hint down */
+	if (!user_based && !fs_based)
+		return WRITE_LIFE_NOT_SET;
+
+	switch (type) {
+	case DATA:
+		/* HOT and COLD map identically in both modes */
+		if (temp == HOT)
+			return WRITE_LIFE_SHORT;
+		if (temp == COLD)
+			return WRITE_LIFE_EXTREME;
+		/* WARM data only gets a hint in the fs-based mode */
+		if (temp == WARM && fs_based)
+			return WRITE_LIFE_LONG;
+		break;
+	case NODE:
+		/* only COLD nodes in the fs-based mode get a hint */
+		if (fs_based && temp == COLD)
+			return WRITE_LIFE_NONE;
+		break;
+	case META:
+		if (fs_based)
+			return WRITE_LIFE_MEDIUM;
+		break;
+	default:
+		break;
+	}
+
+	/* every combination not matched above */
+	return WRITE_LIFE_NOT_SET;
+}
+
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
@@ -2487,7 +2587,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (is_cold_data(fio->page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
- if (is_inode_flag_set(inode, FI_HOT_DATA))
+ if (file_is_hot(inode) ||
+ is_inode_flag_set(inode, FI_HOT_DATA))
return CURSEG_HOT_DATA;
return rw_hint_to_seg_type(inode->i_write_hint);
} else {
@@ -2502,7 +2603,7 @@ static int __get_segment_type(struct f2fs_io_info *fio)
{
int type = 0;
- switch (fio->sbi->active_logs) {
+ switch (F2FS_OPTION(fio->sbi).active_logs) {
case 2:
type = __get_segment_type_2(fio);
break;
@@ -2642,6 +2743,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
+ .temp = HOT,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = page->index,
@@ -2688,8 +2790,15 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
int rewrite_data_page(struct f2fs_io_info *fio)
{
int err;
+ struct f2fs_sb_info *sbi = fio->sbi;
fio->new_blkaddr = fio->old_blkaddr;
+ /* i/o temperature is needed for passing down write hints */
+ __get_segment_type(fio);
+
+ f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi,
+ GET_SEGNO(sbi, fio->new_blkaddr))->type));
+
stat_inc_inplace_blocks(fio->sbi);
err = f2fs_submit_page_bio(fio);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index f11c4bc82c78..3325d0769723 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -53,13 +53,19 @@
((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
(sbi)->segs_per_sec)) \
-#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
-#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
+#define MAIN_BLKADDR(sbi) \
+ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr))
+#define SEG0_BLKADDR(sbi) \
+ (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr))
#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
#define MAIN_SECS(sbi) ((sbi)->total_sections)
-#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
+#define TOTAL_SEGS(sbi) \
+ (SM_I(sbi) ? SM_I(sbi)->segment_count : \
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count))
#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
@@ -596,6 +602,8 @@ static inline int utilization(struct f2fs_sb_info *sbi)
#define DEF_MIN_FSYNC_BLOCKS 8
#define DEF_MIN_HOT_BLOCKS 16
+#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */
+
enum {
F2FS_IPU_FORCE,
F2FS_IPU_SSR,
@@ -630,10 +638,17 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1);
}
-static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr)
{
- BUG_ON(blk_addr < SEG0_BLKADDR(sbi)
- || blk_addr >= MAX_BLKADDR(sbi));
+ struct f2fs_sb_info *sbi = fio->sbi;
+
+ if (PAGE_TYPE_OF_BIO(fio->type) == META &&
+ (!is_read_io(fio->op) || fio->is_meta))
+ BUG_ON(blk_addr < SEG0_BLKADDR(sbi) ||
+ blk_addr >= MAIN_BLKADDR(sbi));
+ else
+ BUG_ON(blk_addr < MAIN_BLKADDR(sbi) ||
+ blk_addr >= MAX_BLKADDR(sbi));
}
/*
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8173ae688814..42d564c5ccd0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -60,7 +60,7 @@ char *fault_name[FAULT_MAX] = {
static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
unsigned int rate)
{
- struct f2fs_fault_info *ffi = &sbi->fault_info;
+ struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
if (rate) {
atomic_set(&ffi->inject_ops, 0);
@@ -129,6 +129,10 @@ enum {
Opt_jqfmt_vfsold,
Opt_jqfmt_vfsv0,
Opt_jqfmt_vfsv1,
+ Opt_whint,
+ Opt_alloc,
+ Opt_fsync,
+ Opt_test_dummy_encryption,
Opt_err,
};
@@ -182,6 +186,10 @@ static match_table_t f2fs_tokens = {
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
+ {Opt_whint, "whint_mode=%s"},
+ {Opt_alloc, "alloc_mode=%s"},
+ {Opt_fsync, "fsync_mode=%s"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_err, NULL},
};
@@ -202,21 +210,24 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
block_t limit = (sbi->user_block_count << 1) / 1000;
/* limit is 0.2% */
- if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) {
- sbi->root_reserved_blocks = limit;
+ if (test_opt(sbi, RESERVE_ROOT) &&
+ F2FS_OPTION(sbi).root_reserved_blocks > limit) {
+ F2FS_OPTION(sbi).root_reserved_blocks = limit;
f2fs_msg(sbi->sb, KERN_INFO,
"Reduce reserved blocks for root = %u",
- sbi->root_reserved_blocks);
+ F2FS_OPTION(sbi).root_reserved_blocks);
}
if (!test_opt(sbi, RESERVE_ROOT) &&
- (!uid_eq(sbi->s_resuid,
+ (!uid_eq(F2FS_OPTION(sbi).s_resuid,
make_kuid(&init_user_ns, F2FS_DEF_RESUID)) ||
- !gid_eq(sbi->s_resgid,
+ !gid_eq(F2FS_OPTION(sbi).s_resgid,
make_kgid(&init_user_ns, F2FS_DEF_RESGID))))
f2fs_msg(sbi->sb, KERN_INFO,
"Ignore s_resuid=%u, s_resgid=%u w/o reserve_root",
- from_kuid_munged(&init_user_ns, sbi->s_resuid),
- from_kgid_munged(&init_user_ns, sbi->s_resgid));
+ from_kuid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resuid),
+ from_kgid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resgid));
}
static void init_once(void *foo)
@@ -236,7 +247,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
char *qname;
int ret = -EINVAL;
- if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) {
+ if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) {
f2fs_msg(sb, KERN_ERR,
"Cannot change journaled "
"quota options when quota turned on");
@@ -254,8 +265,8 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
"Not enough memory for storing quotafile name");
return -EINVAL;
}
- if (sbi->s_qf_names[qtype]) {
- if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
+ if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
ret = 0;
else
f2fs_msg(sb, KERN_ERR,
@@ -268,7 +279,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
"quotafile must be on filesystem root");
goto errout;
}
- sbi->s_qf_names[qtype] = qname;
+ F2FS_OPTION(sbi).s_qf_names[qtype] = qname;
set_opt(sbi, QUOTA);
return 0;
errout:
@@ -280,13 +291,13 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) {
+ if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) {
f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options"
" when quota turned on");
return -EINVAL;
}
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ kfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
+ F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
return 0;
}
@@ -302,15 +313,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
"Cannot enable project quota enforcement.");
return -1;
}
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] ||
- sbi->s_qf_names[PRJQUOTA]) {
- if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) {
+ if (test_opt(sbi, USRQUOTA) &&
+ F2FS_OPTION(sbi).s_qf_names[USRQUOTA])
clear_opt(sbi, USRQUOTA);
- if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+ if (test_opt(sbi, GRPQUOTA) &&
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA])
clear_opt(sbi, GRPQUOTA);
- if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA])
+ if (test_opt(sbi, PRJQUOTA) &&
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
clear_opt(sbi, PRJQUOTA);
if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) ||
@@ -320,19 +335,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
return -1;
}
- if (!sbi->s_jquota_fmt) {
+ if (!F2FS_OPTION(sbi).s_jquota_fmt) {
f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format "
"not specified");
return -1;
}
}
- if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) {
+ if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) {
f2fs_msg(sbi->sb, KERN_INFO,
"QUOTA feature is enabled, so ignore jquota_fmt");
- sbi->s_jquota_fmt = 0;
+ F2FS_OPTION(sbi).s_jquota_fmt = 0;
}
- if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) {
+ if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) {
f2fs_msg(sbi->sb, KERN_INFO,
"Filesystem with quota feature cannot be mounted RDWR "
"without CONFIG_QUOTA");
@@ -403,14 +418,14 @@ static int parse_options(struct super_block *sb, char *options)
q = bdev_get_queue(sb->s_bdev);
if (blk_queue_discard(q)) {
set_opt(sbi, DISCARD);
- } else if (!f2fs_sb_mounted_blkzoned(sb)) {
+ } else if (!f2fs_sb_has_blkzoned(sb)) {
f2fs_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but "
"the device does not support discard");
}
break;
case Opt_nodiscard:
- if (f2fs_sb_mounted_blkzoned(sb)) {
+ if (f2fs_sb_has_blkzoned(sb)) {
f2fs_msg(sb, KERN_WARNING,
"discard is required for zoned block devices");
return -EINVAL;
@@ -440,7 +455,7 @@ static int parse_options(struct super_block *sb, char *options)
if (args->from && match_int(args, &arg))
return -EINVAL;
set_opt(sbi, INLINE_XATTR_SIZE);
- sbi->inline_xattr_size = arg;
+ F2FS_OPTION(sbi).inline_xattr_size = arg;
break;
#else
case Opt_user_xattr:
@@ -480,7 +495,7 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
return -EINVAL;
- sbi->active_logs = arg;
+ F2FS_OPTION(sbi).active_logs = arg;
break;
case Opt_disable_ext_identify:
set_opt(sbi, DISABLE_EXT_IDENTIFY);
@@ -524,9 +539,9 @@ static int parse_options(struct super_block *sb, char *options)
if (test_opt(sbi, RESERVE_ROOT)) {
f2fs_msg(sb, KERN_INFO,
"Preserve previous reserve_root=%u",
- sbi->root_reserved_blocks);
+ F2FS_OPTION(sbi).root_reserved_blocks);
} else {
- sbi->root_reserved_blocks = arg;
+ F2FS_OPTION(sbi).root_reserved_blocks = arg;
set_opt(sbi, RESERVE_ROOT);
}
break;
@@ -539,7 +554,7 @@ static int parse_options(struct super_block *sb, char *options)
"Invalid uid value %d", arg);
return -EINVAL;
}
- sbi->s_resuid = uid;
+ F2FS_OPTION(sbi).s_resuid = uid;
break;
case Opt_resgid:
if (args->from && match_int(args, &arg))
@@ -550,7 +565,7 @@ static int parse_options(struct super_block *sb, char *options)
"Invalid gid value %d", arg);
return -EINVAL;
}
- sbi->s_resgid = gid;
+ F2FS_OPTION(sbi).s_resgid = gid;
break;
case Opt_mode:
name = match_strdup(&args[0]);
@@ -559,7 +574,7 @@ static int parse_options(struct super_block *sb, char *options)
return -ENOMEM;
if (strlen(name) == 8 &&
!strncmp(name, "adaptive", 8)) {
- if (f2fs_sb_mounted_blkzoned(sb)) {
+ if (f2fs_sb_has_blkzoned(sb)) {
f2fs_msg(sb, KERN_WARNING,
"adaptive mode is not allowed with "
"zoned block device feature");
@@ -585,7 +600,7 @@ static int parse_options(struct super_block *sb, char *options)
1 << arg, BIO_MAX_PAGES);
return -EINVAL;
}
- sbi->write_io_size_bits = arg;
+ F2FS_OPTION(sbi).write_io_size_bits = arg;
break;
case Opt_fault_injection:
if (args->from && match_int(args, &arg))
@@ -646,13 +661,13 @@ static int parse_options(struct super_block *sb, char *options)
return ret;
break;
case Opt_jqfmt_vfsold:
- sbi->s_jquota_fmt = QFMT_VFS_OLD;
+ F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD;
break;
case Opt_jqfmt_vfsv0:
- sbi->s_jquota_fmt = QFMT_VFS_V0;
+ F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0;
break;
case Opt_jqfmt_vfsv1:
- sbi->s_jquota_fmt = QFMT_VFS_V1;
+ F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1;
break;
case Opt_noquota:
clear_opt(sbi, QUOTA);
@@ -679,6 +694,73 @@ static int parse_options(struct super_block *sb, char *options)
"quota operations not supported");
break;
#endif
+ case Opt_whint:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 10 &&
+ !strncmp(name, "user-based", 10)) {
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
+ } else if (strlen(name) == 3 &&
+ !strncmp(name, "off", 3)) {
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
+ } else if (strlen(name) == 8 &&
+ !strncmp(name, "fs-based", 8)) {
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_alloc:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+
+ if (strlen(name) == 7 &&
+ !strncmp(name, "default", 7)) {
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ } else if (strlen(name) == 5 &&
+ !strncmp(name, "reuse", 5)) {
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_fsync:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 5 &&
+ !strncmp(name, "posix", 5)) {
+ F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
+ } else if (strlen(name) == 6 &&
+ !strncmp(name, "strict", 6)) {
+ F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_test_dummy_encryption:
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ if (!f2fs_sb_has_encrypt(sb)) {
+ f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
+ return -EINVAL;
+ }
+
+ F2FS_OPTION(sbi).test_dummy_encryption = true;
+ f2fs_msg(sb, KERN_INFO,
+ "Test dummy encryption mode enabled");
+#else
+ f2fs_msg(sb, KERN_INFO,
+ "Test dummy encryption mount option ignored");
+#endif
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -699,14 +781,22 @@ static int parse_options(struct super_block *sb, char *options)
}
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+ if (!f2fs_sb_has_extra_attr(sb) ||
+ !f2fs_sb_has_flexible_inline_xattr(sb)) {
+ f2fs_msg(sb, KERN_ERR,
+ "extra_attr or flexible_inline_xattr "
+ "feature is off");
+ return -EINVAL;
+ }
if (!test_opt(sbi, INLINE_XATTR)) {
f2fs_msg(sb, KERN_ERR,
"inline_xattr_size option should be "
"set with inline_xattr option");
return -EINVAL;
}
- if (!sbi->inline_xattr_size ||
- sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE -
+ if (!F2FS_OPTION(sbi).inline_xattr_size ||
+ F2FS_OPTION(sbi).inline_xattr_size >=
+ DEF_ADDRS_PER_INODE -
F2FS_TOTAL_EXTRA_ATTR_SIZE -
DEF_INLINE_RESERVED_SIZE -
DEF_MIN_INLINE_SIZE) {
@@ -715,6 +805,12 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
}
}
+
+ /* Do not pass down write hints if the number of active logs is less
+ * than NR_CURSEG_TYPE.
+ */
+ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE)
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
return 0;
}
@@ -731,7 +827,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
- fi->i_advise = 0;
init_rwsem(&fi->i_sem);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
@@ -743,10 +838,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_rwsem(&fi->i_mmap_sem);
init_rwsem(&fi->i_xattr_sem);
-#ifdef CONFIG_QUOTA
- memset(&fi->i_dquot, 0, sizeof(fi->i_dquot));
- fi->i_reserved_quota = 0;
-#endif
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -956,7 +1047,7 @@ static void f2fs_put_super(struct super_block *sb)
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
- kfree(sbi->s_qf_names[i]);
+ kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
@@ -1070,8 +1161,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = total_count - start_count;
buf->f_bfree = user_block_count - valid_user_blocks(sbi) -
sbi->current_reserved_blocks;
- if (buf->f_bfree > sbi->root_reserved_blocks)
- buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks;
+ if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks)
+ buf->f_bavail = buf->f_bfree -
+ F2FS_OPTION(sbi).root_reserved_blocks;
else
buf->f_bavail = 0;
@@ -1106,10 +1198,10 @@ static inline void f2fs_show_quota_options(struct seq_file *seq,
#ifdef CONFIG_QUOTA
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- if (sbi->s_jquota_fmt) {
+ if (F2FS_OPTION(sbi).s_jquota_fmt) {
char *fmtname = "";
- switch (sbi->s_jquota_fmt) {
+ switch (F2FS_OPTION(sbi).s_jquota_fmt) {
case QFMT_VFS_OLD:
fmtname = "vfsold";
break;
@@ -1123,14 +1215,17 @@ static inline void f2fs_show_quota_options(struct seq_file *seq,
seq_printf(seq, ",jqfmt=%s", fmtname);
}
- if (sbi->s_qf_names[USRQUOTA])
- seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA])
+ seq_show_option(seq, "usrjquota",
+ F2FS_OPTION(sbi).s_qf_names[USRQUOTA]);
- if (sbi->s_qf_names[GRPQUOTA])
- seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
+ if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA])
+ seq_show_option(seq, "grpjquota",
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]);
- if (sbi->s_qf_names[PRJQUOTA])
- seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]);
+ if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+ seq_show_option(seq, "prjjquota",
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]);
#endif
}
@@ -1165,7 +1260,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noinline_xattr");
if (test_opt(sbi, INLINE_XATTR_SIZE))
seq_printf(seq, ",inline_xattr_size=%u",
- sbi->inline_xattr_size);
+ F2FS_OPTION(sbi).inline_xattr_size);
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
if (test_opt(sbi, POSIX_ACL))
@@ -1201,18 +1296,20 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, "adaptive");
else if (test_opt(sbi, LFS))
seq_puts(seq, "lfs");
- seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+ seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
if (test_opt(sbi, RESERVE_ROOT))
seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u",
- sbi->root_reserved_blocks,
- from_kuid_munged(&init_user_ns, sbi->s_resuid),
- from_kgid_munged(&init_user_ns, sbi->s_resgid));
+ F2FS_OPTION(sbi).root_reserved_blocks,
+ from_kuid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resuid),
+ from_kgid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resgid));
if (F2FS_IO_SIZE_BITS(sbi))
seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi));
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (test_opt(sbi, FAULT_INJECTION))
seq_printf(seq, ",fault_injection=%u",
- sbi->fault_info.inject_rate);
+ F2FS_OPTION(sbi).fault_info.inject_rate);
#endif
#ifdef CONFIG_QUOTA
if (test_opt(sbi, QUOTA))
@@ -1225,15 +1322,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",prjquota");
#endif
f2fs_show_quota_options(seq, sbi->sb);
+ if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER)
+ seq_printf(seq, ",whint_mode=%s", "user-based");
+ else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
+ seq_printf(seq, ",whint_mode=%s", "fs-based");
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ if (F2FS_OPTION(sbi).test_dummy_encryption)
+ seq_puts(seq, ",test_dummy_encryption");
+#endif
+
+ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT)
+ seq_printf(seq, ",alloc_mode=%s", "default");
+ else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
+ seq_printf(seq, ",alloc_mode=%s", "reuse");
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX)
+ seq_printf(seq, ",fsync_mode=%s", "posix");
+ else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
+ seq_printf(seq, ",fsync_mode=%s", "strict");
return 0;
}
static void default_options(struct f2fs_sb_info *sbi)
{
/* init some FS parameters */
- sbi->active_logs = NR_CURSEG_TYPE;
- sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+ F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE;
+ F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
+ F2FS_OPTION(sbi).test_dummy_encryption = false;
+ sbi->readdir_ra = 1;
set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_XATTR);
@@ -1243,7 +1362,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, NOHEAP);
sbi->sb->s_flags |= SB_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
- if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
+ if (f2fs_sb_has_blkzoned(sbi->sb)) {
set_opt_mode(sbi, F2FS_MOUNT_LFS);
set_opt(sbi, DISCARD);
} else {
@@ -1270,16 +1389,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct f2fs_mount_info org_mount_opt;
unsigned long old_sb_flags;
- int err, active_logs;
+ int err;
bool need_restart_gc = false;
bool need_stop_gc = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- struct f2fs_fault_info ffi = sbi->fault_info;
-#endif
#ifdef CONFIG_QUOTA
- int s_jquota_fmt;
- char *s_qf_names[MAXQUOTAS];
int i, j;
#endif
@@ -1289,21 +1403,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
*/
org_mount_opt = sbi->mount_opt;
old_sb_flags = sb->s_flags;
- active_logs = sbi->active_logs;
#ifdef CONFIG_QUOTA
- s_jquota_fmt = sbi->s_jquota_fmt;
+ org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i]) {
- s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
- GFP_KERNEL);
- if (!s_qf_names[i]) {
+ if (F2FS_OPTION(sbi).s_qf_names[i]) {
+ org_mount_opt.s_qf_names[i] =
+ kstrdup(F2FS_OPTION(sbi).s_qf_names[i],
+ GFP_KERNEL);
+ if (!org_mount_opt.s_qf_names[i]) {
for (j = 0; j < i; j++)
- kfree(s_qf_names[j]);
+ kfree(org_mount_opt.s_qf_names[j]);
return -ENOMEM;
}
} else {
- s_qf_names[i] = NULL;
+ org_mount_opt.s_qf_names[i] = NULL;
}
}
#endif
@@ -1373,7 +1487,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY) {
+ if (*flags & SB_RDONLY ||
+ F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
writeback_inodes_sb(sb, WB_REASON_SYNC);
sync_inodes_sb(sb);
@@ -1399,7 +1514,7 @@ skip:
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- kfree(s_qf_names[i]);
+ kfree(org_mount_opt.s_qf_names[i]);
#endif
/* Update the POSIXACL Flag */
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -1417,18 +1532,14 @@ restore_gc:
}
restore_opts:
#ifdef CONFIG_QUOTA
- sbi->s_jquota_fmt = s_jquota_fmt;
+ F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- kfree(sbi->s_qf_names[i]);
- sbi->s_qf_names[i] = s_qf_names[i];
+ kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i];
}
#endif
sbi->mount_opt = org_mount_opt;
- sbi->active_logs = active_logs;
sb->s_flags = old_sb_flags;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- sbi->fault_info = ffi;
-#endif
return err;
}
@@ -1456,7 +1567,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
while (toread > 0) {
tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
repeat:
- page = read_mapping_page(mapping, blkidx, NULL);
+ page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, HZ/50);
@@ -1550,8 +1661,8 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode)
static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type)
{
- return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type],
- sbi->s_jquota_fmt, type);
+ return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type],
+ F2FS_OPTION(sbi).s_jquota_fmt, type);
}
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
@@ -1570,7 +1681,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
}
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i]) {
+ if (F2FS_OPTION(sbi).s_qf_names[i]) {
err = f2fs_quota_on_mount(sbi, i);
if (!err) {
enabled = 1;
@@ -1797,11 +1908,28 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ /*
+ * Encrypting the root directory is not allowed because fsck
+ * expects lost+found directory to exist and remain unencrypted
+ * if LOST_FOUND feature is enabled.
+ *
+ */
+ if (f2fs_sb_has_lost_found(sbi->sb) &&
+ inode->i_ino == F2FS_ROOT_INO(sbi))
+ return -EPERM;
+
return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
ctx, len, fs_data, XATTR_CREATE);
}
+static bool f2fs_dummy_context(struct inode *inode)
+{
+ return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode));
+}
+
static unsigned f2fs_max_namelen(struct inode *inode)
{
return S_ISLNK(inode->i_mode) ?
@@ -1812,6 +1940,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
+ .dummy_context = f2fs_dummy_context,
.empty_dir = f2fs_empty_dir,
.max_namelen = f2fs_max_namelen,
};
@@ -1894,7 +2023,6 @@ static int __f2fs_commit_super(struct buffer_head *bh,
lock_buffer(bh);
if (super)
memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
- set_buffer_uptodate(bh);
set_buffer_dirty(bh);
unlock_buffer(bh);
@@ -2181,6 +2309,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->dirty_device = 0;
spin_lock_init(&sbi->dev_lock);
+
+ init_rwsem(&sbi->sb_lock);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -2206,7 +2336,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
unsigned int n = 0;
int err = -EIO;
- if (!f2fs_sb_mounted_blkzoned(sbi->sb))
+ if (!f2fs_sb_has_blkzoned(sbi->sb))
return 0;
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
@@ -2334,7 +2464,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
}
/* write back-up superblock first */
- bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1);
+ bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1);
if (!bh)
return -EIO;
err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
@@ -2345,7 +2475,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
/* write current valid superblock */
- bh = sb_getblk(sbi->sb, sbi->valid_super_block);
+ bh = sb_bread(sbi->sb, sbi->valid_super_block);
if (!bh)
return -EIO;
err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
@@ -2413,7 +2543,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
#ifdef CONFIG_BLK_DEV_ZONED
if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
- !f2fs_sb_mounted_blkzoned(sbi->sb)) {
+ !f2fs_sb_has_blkzoned(sbi->sb)) {
f2fs_msg(sbi->sb, KERN_ERR,
"Zoned block device feature not enabled\n");
return -EINVAL;
@@ -2447,6 +2577,18 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
return 0;
}
+static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm = SM_I(sbi);
+
+	/* volumes above the small-volume threshold keep their defaults */
+	if (sm->main_segments > SMALL_VOLUME_SEGMENTS)
+		return;
+	F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+	sm->dcc_info->discard_granularity = 1;
+	sm->ipu_policy = 1 << F2FS_IPU_FORCE;
+}
+
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
@@ -2494,8 +2636,8 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
- sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
- sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
+ F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
+ F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
/* precompute checksum seed for metadata */
if (f2fs_sb_has_inode_chksum(sb))
@@ -2508,7 +2650,7 @@ try_onemore:
* devices, but mandatory for host-managed zoned block devices.
*/
#ifndef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_mounted_blkzoned(sb)) {
+ if (f2fs_sb_has_blkzoned(sb)) {
f2fs_msg(sb, KERN_ERR,
"Zoned block device support is not enabled\n");
err = -EOPNOTSUPP;
@@ -2724,7 +2866,7 @@ try_onemore:
* Turn on quotas which were not enabled for read-only mounts if
* filesystem has quota feature, so that they are updated correctly.
*/
- if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) {
+ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) {
err = f2fs_enable_quotas(sb);
if (err) {
f2fs_msg(sb, KERN_ERR,
@@ -2799,6 +2941,8 @@ skip_recovery:
f2fs_join_shrinker(sbi);
+ f2fs_tuning_parameters(sbi);
+
f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx",
cur_cp_version(F2FS_CKPT(sbi)));
f2fs_update_time(sbi, CP_TIME);
@@ -2807,7 +2951,7 @@ skip_recovery:
free_meta:
#ifdef CONFIG_QUOTA
- if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb))
+ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb))
f2fs_quota_off_umount(sbi->sb);
#endif
f2fs_sync_inode_meta(sbi);
@@ -2851,7 +2995,7 @@ free_bio_info:
free_options:
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
- kfree(sbi->s_qf_names[i]);
+ kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
kfree(options);
free_sb_buf:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index d978c7b6ea04..f33a56d6e6dd 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -58,7 +58,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
#ifdef CONFIG_F2FS_FAULT_INJECTION
else if (struct_type == FAULT_INFO_RATE ||
struct_type == FAULT_INFO_TYPE)
- return (unsigned char *)&sbi->fault_info;
+ return (unsigned char *)&F2FS_OPTION(sbi).fault_info;
#endif
return NULL;
}
@@ -92,10 +92,10 @@ static ssize_t features_show(struct f2fs_attr *a,
if (!sb->s_bdev->bd_part)
return snprintf(buf, PAGE_SIZE, "0\n");
- if (f2fs_sb_has_crypto(sb))
+ if (f2fs_sb_has_encrypt(sb))
len += snprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
- if (f2fs_sb_mounted_blkzoned(sb))
+ if (f2fs_sb_has_blkzoned(sb))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "blkzoned");
if (f2fs_sb_has_extra_attr(sb))
@@ -116,6 +116,9 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_inode_crtime(sb))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_crtime");
+ if (f2fs_sb_has_lost_found(sb))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "lost_found");
len += snprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
@@ -136,6 +139,27 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
if (!ptr)
return -EINVAL;
+	if (!strcmp(a->attr.name, "extension_list")) {
+		__u8 (*extlist)[F2FS_EXTENSION_LEN] =
+					sbi->raw_super->extension_list;
+		int cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+		int hot_count = sbi->raw_super->hot_ext_count;
+		int len, i;
+
+		/* scnprintf() returns bytes written, so len stays below PAGE_SIZE */
+		len = scnprintf(buf, PAGE_SIZE, "cold file extension:\n");
+		for (i = 0; i < cold_count; i++)
+			len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
+								extlist[i]);
+
+		len += scnprintf(buf + len, PAGE_SIZE - len,
+						"hot file extension:\n");
+		for (i = cold_count; i < cold_count + hot_count; i++)
+			len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
+								extlist[i]);
+		return len;
+	}
+
ui = (unsigned int *)(ptr + a->offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
@@ -154,6 +178,41 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
if (!ptr)
return -EINVAL;
+ if (!strcmp(a->attr.name, "extension_list")) {
+ const char *name = strim((char *)buf);
+ bool set = true, hot;
+
+ if (!strncmp(name, "[h]", 3))
+ hot = true;
+ else if (!strncmp(name, "[c]", 3))
+ hot = false;
+ else
+ return -EINVAL;
+
+ name += 3;
+
+ if (*name == '!') {
+ name++;
+ set = false;
+ }
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN)
+ return -EINVAL;
+
+ down_write(&sbi->sb_lock);
+
+ ret = update_extension_list(sbi, name, hot, set);
+ if (ret)
+ goto out;
+
+ ret = f2fs_commit_super(sbi, false);
+ if (ret)
+ update_extension_list(sbi, name, hot, !set);
+out:
+ up_write(&sbi->sb_lock);
+ return ret ? ret : count;
+ }
+
ui = (unsigned int *)(ptr + a->offset);
ret = kstrtoul(skip_spaces(buf), 0, &t);
@@ -166,7 +225,7 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
- sbi->root_reserved_blocks)) {
+ F2FS_OPTION(sbi).root_reserved_blocks)) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@@ -236,6 +295,7 @@ enum feat_id {
FEAT_FLEXIBLE_INLINE_XATTR,
FEAT_QUOTA_INO,
FEAT_INODE_CRTIME,
+ FEAT_LOST_FOUND,
};
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -251,6 +311,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
case FEAT_FLEXIBLE_INLINE_XATTR:
case FEAT_QUOTA_INO:
case FEAT_INODE_CRTIME:
+ case FEAT_LOST_FOUND:
return snprintf(buf, PAGE_SIZE, "supported\n");
}
return 0;
@@ -307,6 +368,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
#ifdef CONFIG_F2FS_FAULT_INJECTION
F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
@@ -329,6 +391,7 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
+F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -357,6 +420,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(iostat_enable),
ATTR_LIST(readdir_ra),
ATTR_LIST(gc_pin_file_thresh),
+ ATTR_LIST(extension_list),
#ifdef CONFIG_F2FS_FAULT_INJECTION
ATTR_LIST(inject_rate),
ATTR_LIST(inject_type),
@@ -383,6 +447,7 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(flexible_inline_xattr),
ATTR_LIST(quota_ino),
ATTR_LIST(inode_crtime),
+ ATTR_LIST(lost_found),
NULL,
};
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1e97f1fda90c..d737ff082472 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -607,8 +607,8 @@ static int fixup_compat_flock(struct flock *flock)
return 0;
}
-COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
- compat_ulong_t, arg)
+static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
+ compat_ulong_t arg)
{
struct fd f = fdget_raw(fd);
struct flock flock;
@@ -672,6 +672,12 @@ out_put:
return err;
}
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
+{
+ return do_compat_fcntl64(fd, cmd, arg);
+}
+
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
@@ -684,7 +690,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
case F_OFD_SETLKW:
return -EINVAL;
}
- return compat_sys_fcntl64(fd, cmd, arg);
+ return do_compat_fcntl64(fd, cmd, arg);
}
#endif
diff --git a/fs/file.c b/fs/file.c
index 42f0db4bd0fb..7ffd6e9d103d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -638,6 +638,7 @@ out_unlock:
spin_unlock(&files->file_lock);
return -EBADF;
}
+EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
void do_close_on_exec(struct files_struct *files)
{
@@ -870,7 +871,7 @@ out_unlock:
return err;
}
-SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
+static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
struct file *file;
@@ -904,6 +905,11 @@ out_unlock:
return err;
}
+/*
+ * dup3() syscall entry point.  It only forwards its arguments to ksys_dup3(),
+ * which dup2() below calls directly as well.
+ */
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
+{
+ return ksys_dup3(oldfd, newfd, flags);
+}
+
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
@@ -916,10 +922,10 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
rcu_read_unlock();
return retval;
}
- return sys_dup3(oldfd, newfd, 0);
+ return ksys_dup3(oldfd, newfd, 0);
}
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
+int ksys_dup(unsigned int fildes)
{
int ret = -EBADF;
struct file *file = fget_raw(fildes);
@@ -934,6 +940,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
return ret;
}
+/*
+ * dup() syscall entry point.  It only forwards its argument to ksys_dup().
+ */
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
+{
+ return ksys_dup(fildes);
+}
+
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
int err;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d4d04fee568a..1280f915079b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1343,7 +1343,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
dirty = inode->i_state & I_DIRTY;
if (inode->i_state & I_DIRTY_TIME) {
- if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ if ((dirty & I_DIRTY_INODE) ||
wbc->sync_mode == WB_SYNC_ALL ||
unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
unlikely(time_after(jiffies,
@@ -2112,7 +2112,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
*/
void __mark_inode_dirty(struct inode *inode, int flags)
{
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
struct super_block *sb = inode->i_sb;
int dirtytime;
@@ -2122,7 +2121,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
+ if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
@@ -2197,7 +2196,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (dirtytime)
inode->dirtied_time_when = jiffies;
- if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+ if (inode->i_state & I_DIRTY)
dirty_list = &wb->b_dirty;
else
dirty_list = &wb->b_dirty_time;
@@ -2221,8 +2220,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
out_unlock_inode:
spin_unlock(&inode->i_lock);
-
-#undef I_DIRTY_INODE
}
EXPORT_SYMBOL(__mark_inode_dirty);
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 56cce7fdd39e..c184c5a356ff 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -125,7 +125,7 @@ struct fscache_cache *fscache_select_cache_for_object(
}
/* the parent is unbacked */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
/* cookie not an index and is unbacked */
spin_unlock(&cookie->lock);
_leave(" = NULL [cookie ub,ni]");
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index ff84258132bb..7dc55b93a830 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -21,12 +21,54 @@ struct kmem_cache *fscache_cookie_jar;
static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
-static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+#define fscache_cookie_hash_shift 15
+static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift];
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
+ loff_t object_size);
static int fscache_alloc_object(struct fscache_cache *cache,
struct fscache_cookie *cookie);
static int fscache_attach_object(struct fscache_cookie *cookie,
struct fscache_object *object);
+/*
+ * Dump the state of a cookie to the kernel log at error level.
+ *
+ * Every line emitted is tagged with @prefix so that several cookies can be
+ * told apart within one report (fscache_hash_cookie() passes 'O' for the
+ * cookie already in the hash and 'N' for the new candidate).  The index key
+ * is printed as a hex string.
+ */
+static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
+{
+ struct hlist_node *object;
+ const u8 *k;
+ unsigned loop;
+
+ pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n",
+ prefix, cookie, cookie->parent, cookie->flags,
+ atomic_read(&cookie->n_children),
+ atomic_read(&cookie->n_active));
+ pr_err("%c-cookie d=%p n=%p\n",
+ prefix, cookie->def, cookie->netfs_data);
+
+ /* Only the first backing object, if there is one, is reported. */
+ object = READ_ONCE(cookie->backing_objects.first);
+ if (object)
+ pr_err("%c-cookie o=%p\n",
+ prefix, hlist_entry(object, struct fscache_object, cookie_link));
+
+ pr_err("%c-key=[%u] '", prefix, cookie->key_len);
+ /* Short keys live in the cookie itself; longer ones hang off ->key. */
+ k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
+ cookie->inline_key : cookie->key;
+ for (loop = 0; loop < cookie->key_len; loop++)
+ pr_cont("%02x", k[loop]);
+ pr_cont("'\n");
+}
+
+/*
+ * Free a cookie together with any out-of-line key and auxiliary data buffers.
+ *
+ * Passing NULL is a no-op.  The cookie must have no backing objects attached
+ * (this is a BUG otherwise).  Nothing here removes the cookie from the hash
+ * table or drops a reference on its parent; that is up to the caller.
+ */
+void fscache_free_cookie(struct fscache_cookie *cookie)
+{
+ if (cookie) {
+ BUG_ON(!hlist_empty(&cookie->backing_objects));
+ /* The lengths say whether the data was stored inline or kmalloc'd. */
+ if (cookie->aux_len > sizeof(cookie->inline_aux))
+ kfree(cookie->aux);
+ if (cookie->key_len > sizeof(cookie->inline_key))
+ kfree(cookie->key);
+ kmem_cache_free(fscache_cookie_jar, cookie);
+ }
+}
+
/*
* initialise an cookie jar slab element prior to any use
*/
@@ -41,6 +83,170 @@ void fscache_cookie_init_once(void *_cookie)
}
/*
+ * Set the index key in a cookie and calculate its hash.
+ *
+ * A key that fits in cookie->inline_key is stored there; a longer key is
+ * copied into a separately allocated buffer that cookie->key points to.  In
+ * both cases the storage is zero-padded up to a whole number of 32-bit words,
+ * because the hash is summed a word at a time and must not read bytes that
+ * were not initialised here.
+ *
+ * The key length is recorded in cookie->key_len and the hash in
+ * cookie->key_hash.  The hash folds in cookie->parent and cookie->type as they
+ * stand at the time of the call.
+ *
+ * Returns 0 on success or -ENOMEM if an out-of-line key buffer could not be
+ * allocated.
+ */
+static int fscache_set_key(struct fscache_cookie *cookie,
+			   const void *index_key, size_t index_key_len)
+{
+	unsigned long long h;
+	size_t nr_words, i;
+	u32 *buf;
+
+	cookie->key_len = index_key_len;
+
+	/* The hash loop below walks the key in u32 units, rounding up. */
+	nr_words = (index_key_len + sizeof(u32) - 1) / sizeof(u32);
+
+	if (index_key_len > sizeof(cookie->inline_key)) {
+		/* Allocate whole words rather than just index_key_len bytes
+		 * so that the final, partial word read by the hash lies inside
+		 * the buffer.
+		 */
+		buf = kzalloc(nr_words * sizeof(u32), GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		cookie->key = buf;
+	} else {
+		/* Clear the whole inline area, not a fixed number of words,
+		 * so that the padding is zeroed whatever its size.
+		 */
+		memset(cookie->inline_key, 0, sizeof(cookie->inline_key));
+		buf = (u32 *)cookie->inline_key;
+	}
+
+	memcpy(buf, index_key, index_key_len);
+
+	/* Sum the parent pointer, the length, the type and the key words. */
+	h = (unsigned long)cookie->parent;
+	h += index_key_len + cookie->type;
+	for (i = 0; i < nr_words; i++)
+		h += buf[i];
+
+	/* Fold the upper 32 bits into the lower ones. */
+	cookie->key_hash = h ^ (h >> 32);
+	return 0;
+}
+
+/*
+ * Compare two cookies for identity as far as the cookie hash is concerned.
+ *
+ * Returns 0 if both have the same key hash, parent, key length, type and key
+ * bytes, and a non-zero value otherwise.  The cheap scalar fields are checked
+ * first so that the key contents are only compared when everything else
+ * already matches.
+ */
+static long fscache_compare_cookie(const struct fscache_cookie *a,
+ const struct fscache_cookie *b)
+{
+ const void *ka, *kb;
+
+ if (a->key_hash != b->key_hash)
+ return (long)a->key_hash - (long)b->key_hash;
+ if (a->parent != b->parent)
+ return (long)a->parent - (long)b->parent;
+ if (a->key_len != b->key_len)
+ return (long)a->key_len - (long)b->key_len;
+ if (a->type != b->type)
+ return (long)a->type - (long)b->type;
+
+ /* The lengths are equal here, so both keys are stored the same way. */
+ if (a->key_len <= sizeof(a->inline_key)) {
+ ka = &a->inline_key;
+ kb = &b->inline_key;
+ } else {
+ ka = a->key;
+ kb = b->key;
+ }
+ return memcmp(ka, kb, a->key_len);
+}
+
+/*
+ * Allocate and initialise a cookie without publishing it.
+ *
+ * The index key and the auxiliary data are copied into the cookie (inline if
+ * they fit, otherwise into separately allocated buffers).  The cookie is
+ * returned with a usage count of 1 and an active count of 1.  It is not added
+ * to the cookie hash and no reference is taken on @parent here;
+ * fscache_hash_cookie() does both when the cookie is inserted.
+ *
+ * @object_size is part of the interface but is not used by this function.
+ *
+ * Returns the new cookie, or NULL if memory could not be allocated.
+ */
+struct fscache_cookie *fscache_alloc_cookie(
+	struct fscache_cookie *parent,
+	const struct fscache_cookie_def *def,
+	const void *index_key, size_t index_key_len,
+	const void *aux_data, size_t aux_data_len,
+	void *netfs_data,
+	loff_t object_size)
+{
+	struct fscache_cookie *cookie;
+
+	/* allocate and initialise a cookie */
+	cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+	if (!cookie)
+		return NULL;
+
+	/* kmem_cache_alloc() does not clear a recycled object, so make the
+	 * out-of-line pointers safe for fscache_free_cookie() to kfree() in
+	 * case we bail out before they have been filled in.
+	 */
+	cookie->key = NULL;
+	cookie->aux = NULL;
+
+	cookie->key_len = index_key_len;
+	cookie->aux_len = aux_data_len;
+
+	/* fscache_set_key() mixes the parent and the type into the key hash,
+	 * so both have to be in place before it runs.
+	 */
+	cookie->parent = parent;
+	cookie->type = def->type;
+
+	if (fscache_set_key(cookie, index_key, index_key_len) < 0)
+		goto nomem;
+
+	if (cookie->aux_len <= sizeof(cookie->inline_aux)) {
+		memcpy(cookie->inline_aux, aux_data, cookie->aux_len);
+	} else {
+		cookie->aux = kmemdup(aux_data, cookie->aux_len, GFP_KERNEL);
+		if (!cookie->aux)
+			goto nomem;
+	}
+
+	atomic_set(&cookie->usage, 1);
+	atomic_set(&cookie->n_children, 0);
+
+	/* We keep the active count elevated until relinquishment to prevent an
+	 * attempt to wake up every time the object operations queue quiesces.
+	 */
+	atomic_set(&cookie->n_active, 1);
+
+	cookie->def = def;
+	cookie->netfs_data = netfs_data;
+	cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
+
+	/* radix tree insertion won't use the preallocation pool unless it's
+	 * told it may not wait */
+	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	return cookie;
+
+nomem:
+	fscache_free_cookie(cookie);
+	return NULL;
+}
+
+/*
+ * Attempt to insert a new cookie into the cookie hash.
+ *
+ * The bucket is searched, under its lock, for a cookie that compares equal to
+ * @candidate.  There are three outcomes:
+ *
+ *  - No match: @candidate is marked ACQUIRED, a reference is taken on its
+ *    parent, the parent's child count is bumped and @candidate is added to
+ *    the bucket.  @candidate is returned.
+ *
+ *  - A match that was not marked ACQUIRED: the existing cookie is marked
+ *    ACQUIRED, an extra reference is taken on it and it is returned.
+ *
+ *  - A match that was already marked ACQUIRED: both cookies are dumped to the
+ *    log as a duplicate and NULL is returned.
+ *
+ * In the last two cases @candidate is left untouched and is not freed here.
+ */
+struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate)
+{
+ struct fscache_cookie *cursor;
+ struct hlist_bl_head *h;
+ struct hlist_bl_node *p;
+ unsigned int bucket;
+
+ /* The table size is a power of two, so masking selects the bucket. */
+ bucket = candidate->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1);
+ h = &fscache_cookie_hash[bucket];
+
+ hlist_bl_lock(h);
+ hlist_bl_for_each_entry(cursor, p, h, hash_link) {
+ if (fscache_compare_cookie(candidate, cursor) == 0)
+ goto collision;
+ }
+
+ /* Nothing equal in the bucket: publish the candidate. */
+ __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags);
+ fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent);
+ atomic_inc(&candidate->parent->n_children);
+ hlist_bl_add_head(&candidate->hash_link, h);
+ hlist_bl_unlock(h);
+ return candidate;
+
+collision:
+ /* An equal cookie that is already ACQUIRED is a duplicate. */
+ if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) {
+ trace_fscache_cookie(cursor, fscache_cookie_collision,
+ atomic_read(&cursor->usage));
+ pr_err("Duplicate cookie detected\n");
+ fscache_print_cookie(cursor, 'O');
+ fscache_print_cookie(candidate, 'N');
+ hlist_bl_unlock(h);
+ return NULL;
+ }
+
+ /* Otherwise hand the existing cookie back with a ref for the caller. */
+ fscache_cookie_get(cursor, fscache_cookie_get_reacquire);
+ hlist_bl_unlock(h);
+ return cursor;
+}
+
+/*
* request a cookie to represent an object (index, datafile, xattr, etc)
* - parent specifies the parent object
* - the top level index cookie for each netfs is stored in the fscache_netfs
@@ -58,10 +264,13 @@ void fscache_cookie_init_once(void *_cookie)
struct fscache_cookie *__fscache_acquire_cookie(
struct fscache_cookie *parent,
const struct fscache_cookie_def *def,
+ const void *index_key, size_t index_key_len,
+ const void *aux_data, size_t aux_data_len,
void *netfs_data,
+ loff_t object_size,
bool enable)
{
- struct fscache_cookie *cookie;
+ struct fscache_cookie *candidate, *cookie;
BUG_ON(!def);
@@ -69,6 +278,13 @@ struct fscache_cookie *__fscache_acquire_cookie(
parent ? (char *) parent->def->name : "<no-parent>",
def->name, netfs_data, enable);
+ if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255)
+ return NULL;
+ if (!aux_data || !aux_data_len) {
+ aux_data = NULL;
+ aux_data_len = 0;
+ }
+
fscache_stat(&fscache_n_acquires);
/* if there's no parent cookie, then we don't create one here either */
@@ -79,41 +295,31 @@ struct fscache_cookie *__fscache_acquire_cookie(
}
/* validate the definition */
- BUG_ON(!def->get_key);
BUG_ON(!def->name[0]);
BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
- parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
+ parent->type != FSCACHE_COOKIE_TYPE_INDEX);
- /* allocate and initialise a cookie */
- cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
- if (!cookie) {
+ candidate = fscache_alloc_cookie(parent, def,
+ index_key, index_key_len,
+ aux_data, aux_data_len,
+ netfs_data, object_size);
+ if (!candidate) {
fscache_stat(&fscache_n_acquires_oom);
_leave(" [ENOMEM]");
return NULL;
}
- atomic_set(&cookie->usage, 1);
- atomic_set(&cookie->n_children, 0);
-
- /* We keep the active count elevated until relinquishment to prevent an
- * attempt to wake up every time the object operations queue quiesces.
- */
- atomic_set(&cookie->n_active, 1);
-
- atomic_inc(&parent->usage);
- atomic_inc(&parent->n_children);
+ cookie = fscache_hash_cookie(candidate);
+ if (!cookie) {
+ trace_fscache_cookie(candidate, fscache_cookie_discard, 1);
+ goto out;
+ }
- cookie->def = def;
- cookie->parent = parent;
- cookie->netfs_data = netfs_data;
- cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
+ if (cookie == candidate)
+ candidate = NULL;
- /* radix tree insertion won't use the preallocation pool unless it's
- * told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-
- switch (cookie->def->type) {
+ switch (cookie->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
fscache_stat(&fscache_n_cookie_index);
break;
@@ -125,16 +331,19 @@ struct fscache_cookie *__fscache_acquire_cookie(
break;
}
+ trace_fscache_acquire(cookie);
+
if (enable) {
/* if the object is an index then we need do nothing more here
* - we create indices on disk when we need them as an index
* may exist in multiple caches */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
- if (fscache_acquire_non_index_cookie(cookie) == 0) {
+ if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) {
set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
} else {
atomic_dec(&parent->n_children);
- __fscache_cookie_put(cookie);
+ fscache_cookie_put(cookie,
+ fscache_cookie_put_acquire_nobufs);
fscache_stat(&fscache_n_acquires_nobufs);
_leave(" = NULL");
return NULL;
@@ -145,7 +354,9 @@ struct fscache_cookie *__fscache_acquire_cookie(
}
fscache_stat(&fscache_n_acquires_ok);
- _leave(" = %p", cookie);
+
+out:
+ fscache_free_cookie(candidate);
return cookie;
}
EXPORT_SYMBOL(__fscache_acquire_cookie);
@@ -154,24 +365,30 @@ EXPORT_SYMBOL(__fscache_acquire_cookie);
* Enable a cookie to permit it to accept new operations.
*/
void __fscache_enable_cookie(struct fscache_cookie *cookie,
+ const void *aux_data,
+ loff_t object_size,
bool (*can_enable)(void *data),
void *data)
{
_enter("%p", cookie);
+ trace_fscache_enable(cookie);
+
wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
TASK_UNINTERRUPTIBLE);
+ fscache_update_aux(cookie, aux_data);
+
if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
goto out_unlock;
if (can_enable && !can_enable(data)) {
/* The netfs decided it didn't want to enable after all */
- } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
/* Wait for outstanding disablement to complete */
__fscache_wait_on_invalidate(cookie);
- if (fscache_acquire_non_index_cookie(cookie) == 0)
+ if (fscache_acquire_non_index_cookie(cookie, object_size) == 0)
set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
} else {
set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
@@ -188,11 +405,11 @@ EXPORT_SYMBOL(__fscache_enable_cookie);
* - this must make sure the index chain is instantiated and instantiate the
* object representation too
*/
-static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
+ loff_t object_size)
{
struct fscache_object *object;
struct fscache_cache *cache;
- uint64_t i_size;
int ret;
_enter("");
@@ -231,9 +448,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
return ret;
}
- /* pass on how big the object we're caching is supposed to be */
- cookie->def->get_attr(cookie->netfs_data, &i_size);
-
spin_lock(&cookie->lock);
if (hlist_empty(&cookie->backing_objects)) {
spin_unlock(&cookie->lock);
@@ -243,7 +457,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
- fscache_set_store_limit(object, i_size);
+ fscache_set_store_limit(object, object_size);
/* initiate the process of looking up all the objects in the chain
* (done by fscache_initialise_object()) */
@@ -318,7 +532,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
* attached to the cookie */
if (fscache_attach_object(cookie, object) < 0) {
fscache_stat(&fscache_n_cop_put_object);
- cache->ops->put_object(object);
+ cache->ops->put_object(object, fscache_obj_put_attach_fail);
fscache_stat_d(&fscache_n_cop_put_object);
}
@@ -338,7 +552,7 @@ object_already_extant:
error_put:
fscache_stat(&fscache_n_cop_put_object);
- cache->ops->put_object(object);
+ cache->ops->put_object(object, fscache_obj_put_alloc_fail);
fscache_stat_d(&fscache_n_cop_put_object);
error:
_leave(" = %d", ret);
@@ -398,7 +612,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
/* attach to the cookie */
object->cookie = cookie;
- atomic_inc(&cookie->usage);
+ fscache_cookie_get(cookie, fscache_cookie_get_attach_object);
hlist_add_head(&object->cookie_link, &cookie->backing_objects);
fscache_objlist_add(object);
@@ -426,10 +640,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
* there, and if it's doing that, it may as well just retire the
* cookie.
*/
- ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
-
- /* We will be updating the cookie too. */
- BUG_ON(!cookie->def->get_aux);
+ ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
/* If there's an object, we tell the object state machine to handle the
* invalidation on our behalf, otherwise there's nothing to do.
@@ -473,7 +684,7 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate);
/*
* update the index entries backing a cookie
*/
-void __fscache_update_cookie(struct fscache_cookie *cookie)
+void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data)
{
struct fscache_object *object;
@@ -487,10 +698,10 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
_enter("{%s}", cookie->def->name);
- BUG_ON(!cookie->def->get_aux);
-
spin_lock(&cookie->lock);
+ fscache_update_aux(cookie, aux_data);
+
if (fscache_cookie_enabled(cookie)) {
/* update the index entry on disk in each cache backing this
* cookie.
@@ -509,13 +720,17 @@ EXPORT_SYMBOL(__fscache_update_cookie);
/*
* Disable a cookie to stop it from accepting new requests from the netfs.
*/
-void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
+void __fscache_disable_cookie(struct fscache_cookie *cookie,
+ const void *aux_data,
+ bool invalidate)
{
struct fscache_object *object;
bool awaken = false;
_enter("%p,%u", cookie, invalidate);
+ trace_fscache_disable(cookie);
+
ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
if (atomic_read(&cookie->n_children) != 0) {
@@ -526,6 +741,9 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
TASK_UNINTERRUPTIBLE);
+
+ fscache_update_aux(cookie, aux_data);
+
if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
goto out_unlock_enable;
@@ -557,12 +775,13 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
* n_active reaches 0). This makes sure outstanding reads and writes
* have completed.
*/
- if (!atomic_dec_and_test(&cookie->n_active))
- wait_on_atomic_t(&cookie->n_active, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
+ if (!atomic_dec_and_test(&cookie->n_active)) {
+ wait_var_event(&cookie->n_active,
+ !atomic_read(&cookie->n_active));
+ }
/* Make sure any pending writes are cancelled. */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+ if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX)
fscache_invalidate_writes(cookie);
/* Reset the cookie state if it wasn't relinquished */
@@ -584,7 +803,9 @@ EXPORT_SYMBOL(__fscache_disable_cookie);
* - all dependents of this cookie must have already been unregistered
* (indices/files/pages)
*/
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie,
+ const void *aux_data,
+ bool retire)
{
fscache_stat(&fscache_n_relinquishes);
if (retire)
@@ -600,10 +821,13 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
cookie, cookie->def->name, cookie->netfs_data,
atomic_read(&cookie->n_active), retire);
+ trace_fscache_relinquish(cookie, retire);
+
/* No further netfs-accessing operations on this cookie permitted */
- set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+ if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags))
+ BUG();
- __fscache_disable_cookie(cookie, retire);
+ __fscache_disable_cookie(cookie, aux_data, retire);
/* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
@@ -618,35 +842,54 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
/* Dispose of the netfs's link to the cookie */
ASSERTCMP(atomic_read(&cookie->usage), >, 0);
- fscache_cookie_put(cookie);
+ fscache_cookie_put(cookie, fscache_cookie_put_relinquish);
_leave("");
}
EXPORT_SYMBOL(__fscache_relinquish_cookie);
/*
- * destroy a cookie
+ * Remove a cookie from the hash table.
*/
-void __fscache_cookie_put(struct fscache_cookie *cookie)
+static void fscache_unhash_cookie(struct fscache_cookie *cookie)
+{
+ struct hlist_bl_head *h;
+ unsigned int bucket;
+
+ /* Same bucket selection as fscache_hash_cookie() used on insertion. */
+ bucket = cookie->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1);
+ h = &fscache_cookie_hash[bucket];
+
+ /* The unlink is done under the bucket lock. */
+ hlist_bl_lock(h);
+ hlist_bl_del(&cookie->hash_link);
+ hlist_bl_unlock(h);
+}
+
+/*
+ * Drop a reference to a cookie.
+ */
+void fscache_cookie_put(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
{
struct fscache_cookie *parent;
+ int usage;
_enter("%p", cookie);
- for (;;) {
- _debug("FREE COOKIE %p", cookie);
- parent = cookie->parent;
- BUG_ON(!hlist_empty(&cookie->backing_objects));
- kmem_cache_free(fscache_cookie_jar, cookie);
+ /* Drop the ref; if it was the last one, unhash and free the cookie and
+ * then go round again to drop the ref on its parent, walking up the
+ * chain until a cookie survives or there is no parent left.
+ */
+ do {
+ usage = atomic_dec_return(&cookie->usage);
+ trace_fscache_cookie(cookie, where, usage);
- if (!parent)
- break;
+ if (usage > 0)
+ return;
+ BUG_ON(usage < 0);
+
+ /* Read the parent pointer before the cookie is freed. */
+ parent = cookie->parent;
+ fscache_unhash_cookie(cookie);
+ fscache_free_cookie(cookie);
cookie = parent;
- BUG_ON(atomic_read(&cookie->usage) <= 0);
- if (!atomic_dec_and_test(&cookie->usage))
- break;
- }
+ /* Puts further up the chain are traced as parent puts. */
+ where = fscache_cookie_put_parent;
+ } while (cookie);
_leave("");
}
@@ -656,7 +899,8 @@ void __fscache_cookie_put(struct fscache_cookie *cookie)
*
* NOTE: it only serves no-index type
*/
-int __fscache_check_consistency(struct fscache_cookie *cookie)
+int __fscache_check_consistency(struct fscache_cookie *cookie,
+ const void *aux_data)
{
struct fscache_operation *op;
struct fscache_object *object;
@@ -665,7 +909,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
_enter("%p,", cookie);
- ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+ ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
@@ -677,13 +921,16 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
if (!op)
return -ENOMEM;
- fscache_operation_init(op, NULL, NULL, NULL);
+ fscache_operation_init(cookie, op, NULL, NULL, NULL);
op->flags = FSCACHE_OP_MYTHREAD |
(1 << FSCACHE_OP_WAITING) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
+ trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency);
spin_lock(&cookie->lock);
+ fscache_update_aux(cookie, aux_data);
+
if (!fscache_cookie_enabled(cookie) ||
hlist_empty(&cookie->backing_objects))
goto inconsistent;
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 5a117df2a9ef..aa46e48d8c75 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -13,16 +13,11 @@
#include <linux/module.h>
#include "internal.h"
-static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax);
-
-static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax);
-
static
enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen);
+ uint16_t datalen,
+ loff_t object_size);
/*
* The root index is owned by FS-Cache itself.
@@ -60,6 +55,7 @@ struct fscache_cookie fscache_fsdef_index = {
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
.flags = 1 << FSCACHE_COOKIE_ENABLED,
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
};
EXPORT_SYMBOL(fscache_fsdef_index);
@@ -71,59 +67,18 @@ EXPORT_SYMBOL(fscache_fsdef_index);
struct fscache_cookie_def fscache_fsdef_netfs_def = {
.name = "FSDEF.netfs",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = fscache_fsdef_netfs_get_key,
- .get_aux = fscache_fsdef_netfs_get_aux,
.check_aux = fscache_fsdef_netfs_check_aux,
};
/*
- * get the key data for an FSDEF index record - this is the name of the netfs
- * for which this entry is created
- */
-static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct fscache_netfs *netfs = cookie_netfs_data;
- unsigned klen;
-
- _enter("{%s.%u},", netfs->name, netfs->version);
-
- klen = strlen(netfs->name);
- if (klen > bufmax)
- return 0;
-
- memcpy(buffer, netfs->name, klen);
- return klen;
-}
-
-/*
- * get the auxiliary data for an FSDEF index record - this is the index
- * structure version number of the netfs for which this version is created
- */
-static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct fscache_netfs *netfs = cookie_netfs_data;
- unsigned dlen;
-
- _enter("{%s.%u},", netfs->name, netfs->version);
-
- dlen = sizeof(uint32_t);
- if (dlen > bufmax)
- return 0;
-
- memcpy(buffer, &netfs->version, dlen);
- return dlen;
-}
-
-/*
* check that the index structure version number stored in the auxiliary data
* matches the one the netfs gave us
*/
static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct fscache_netfs *netfs = cookie_netfs_data;
uint32_t version;
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 0ff4b49a0037..500650f938fe 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -29,6 +29,7 @@
#define pr_fmt(fmt) "FS-Cache: " fmt
#include <linux/fscache-cache.h>
+#include <trace/events/fscache.h>
#include <linux/sched.h>
#define FSCACHE_MIN_THREADS 4
@@ -48,8 +49,16 @@ extern struct fscache_cache *fscache_select_cache_for_object(
*/
extern struct kmem_cache *fscache_cookie_jar;
+extern void fscache_free_cookie(struct fscache_cookie *);
extern void fscache_cookie_init_once(void *);
-extern void __fscache_cookie_put(struct fscache_cookie *);
+extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *,
+ const struct fscache_cookie_def *,
+ const void *, size_t,
+ const void *, size_t,
+ void *, loff_t);
+extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *);
+extern void fscache_cookie_put(struct fscache_cookie *,
+ enum fscache_cookie_trace);
/*
* fsdef.c
@@ -311,14 +320,12 @@ static inline void fscache_raise_event(struct fscache_object *object,
fscache_enqueue_object(object);
}
-/*
- * drop a reference to a cookie
- */
-static inline void fscache_cookie_put(struct fscache_cookie *cookie)
+/*
+ * Take a reference on a cookie and emit a cookie trace event carrying the
+ * caller-supplied reason @where and the resulting usage count.
+ */
+static inline void fscache_cookie_get(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
{
- BUG_ON(atomic_read(&cookie->usage) <= 0);
- if (atomic_dec_and_test(&cookie->usage))
- __fscache_cookie_put(cookie);
+ int usage = atomic_inc_return(&cookie->usage);
+
+ trace_fscache_cookie(cookie, where, usage);
}
/*
@@ -342,6 +349,27 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
cookie->def->put_context(cookie->netfs_data, context);
}
+/*
+ * Update the auxiliary data on a cookie.
+ *
+ * If @aux_data is NULL nothing is done.  Otherwise cookie->aux_len bytes are
+ * read from @aux_data, so the buffer must be at least that long.  The stored
+ * copy is only rewritten, and FSCACHE_COOKIE_AUX_UPDATED only set, if the new
+ * data differs from what the cookie already holds.
+ */
+static inline
+void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data)
+{
+ void *p;
+
+ if (!aux_data)
+ return;
+ /* Short aux data lives in the cookie itself; longer data hangs off ->aux. */
+ if (cookie->aux_len <= sizeof(cookie->inline_aux))
+ p = cookie->inline_aux;
+ else
+ p = cookie->aux;
+
+ if (memcmp(p, aux_data, cookie->aux_len) != 0) {
+ memcpy(p, aux_data, cookie->aux_len);
+ set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags);
+ }
+}
+
/*****************************************************************************/
/*
* debug tracing
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 249968dcbf5c..7dce110bf17d 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -16,6 +16,7 @@
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
+#define CREATE_TRACE_POINTS
#include "internal.h"
MODULE_DESCRIPTION("FS Cache Manager");
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index a8aa00be4444..c2f605483cc5 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -14,69 +14,51 @@
#include <linux/slab.h>
#include "internal.h"
-static LIST_HEAD(fscache_netfs_list);
-
/*
* register a network filesystem for caching
*/
int __fscache_register_netfs(struct fscache_netfs *netfs)
{
- struct fscache_netfs *ptr;
- struct fscache_cookie *cookie;
- int ret;
+ struct fscache_cookie *candidate, *cookie;
_enter("{%s}", netfs->name);
- INIT_LIST_HEAD(&netfs->link);
-
/* allocate a cookie for the primary index */
- cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
-
- if (!cookie) {
+ /* The netfs name is the index key and the netfs version the aux data. */
+ candidate = fscache_alloc_cookie(&fscache_fsdef_index,
+ &fscache_fsdef_netfs_def,
+ netfs->name, strlen(netfs->name),
+ &netfs->version, sizeof(netfs->version),
+ netfs, 0);
+ if (!candidate) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- /* initialise the primary index cookie */
- atomic_set(&cookie->usage, 1);
- atomic_set(&cookie->n_children, 0);
- atomic_set(&cookie->n_active, 1);
-
- cookie->def = &fscache_fsdef_netfs_def;
- cookie->parent = &fscache_fsdef_index;
- cookie->netfs_data = netfs;
- cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
-
- spin_lock_init(&cookie->lock);
- spin_lock_init(&cookie->stores_lock);
- INIT_HLIST_HEAD(&cookie->backing_objects);
+ candidate->flags = 1 << FSCACHE_COOKIE_ENABLED;
/* check the netfs type is not already present */
- down_write(&fscache_addremove_sem);
-
- ret = -EEXIST;
- list_for_each_entry(ptr, &fscache_netfs_list, link) {
- if (strcmp(ptr->name, netfs->name) == 0)
- goto already_registered;
+ cookie = fscache_hash_cookie(candidate);
+ if (!cookie)
+ goto already_registered;
+ if (cookie != candidate) {
+ trace_fscache_cookie(candidate, fscache_cookie_discard, 1);
+ fscache_free_cookie(candidate);
}
- atomic_inc(&cookie->parent->usage);
+ /* NOTE(review): when the candidate itself was inserted,
+ * fscache_hash_cookie() has already taken a parent ref and bumped
+ * n_children, so this looks like a second count - confirm intended.
+ */
+ fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs);
atomic_inc(&cookie->parent->n_children);
netfs->primary_index = cookie;
- list_add(&netfs->link, &fscache_netfs_list);
- ret = 0;
pr_notice("Netfs '%s' registered for caching\n", netfs->name);
+ trace_fscache_netfs(netfs);
+ _leave(" = 0");
+ return 0;
already_registered:
- up_write(&fscache_addremove_sem);
-
- if (ret < 0)
- kmem_cache_free(fscache_cookie_jar, cookie);
-
- _leave(" = %d", ret);
- return ret;
+ /* The candidate was never added to the hash and holds no ref on its
+ * parent, so it must be freed directly: fscache_cookie_put() would
+ * unhash a node that isn't linked and drop a parent ref that was
+ * never taken.
+ */
+ trace_fscache_cookie(candidate, fscache_cookie_put_dup_netfs, 0);
+ fscache_free_cookie(candidate);
+ _leave(" = -EEXIST");
+ return -EEXIST;
}
EXPORT_SYMBOL(__fscache_register_netfs);
@@ -88,15 +70,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs)
{
_enter("{%s.%u}", netfs->name, netfs->version);
- down_write(&fscache_addremove_sem);
-
- list_del(&netfs->link);
- fscache_relinquish_cookie(netfs->primary_index, 0);
-
- up_write(&fscache_addremove_sem);
-
- pr_notice("Netfs '%s' unregistered from caching\n",
- netfs->name);
+ fscache_relinquish_cookie(netfs->primary_index, NULL, false);
+ pr_notice("Netfs '%s' unregistered from caching\n", netfs->name);
_leave("");
}
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 0438d4cd91ef..43e6e28c164f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -36,8 +36,6 @@ struct fscache_objlist_data {
#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */
#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */
#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */
-
- u8 buf[512]; /* key and aux data buffer */
};
/*
@@ -170,7 +168,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
struct fscache_cookie *cookie;
unsigned long config = data->config;
char _type[3], *type;
- u8 *buf = data->buf, *p;
+ u8 *p;
if ((unsigned long) v == 1) {
seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
@@ -254,7 +252,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
if (fscache_use_cookie(obj)) {
uint16_t keylen = 0, auxlen = 0;
- switch (cookie->def->type) {
+ switch (cookie->type) {
case 0:
type = "IX";
break;
@@ -263,7 +261,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
break;
default:
snprintf(_type, sizeof(_type), "%02u",
- cookie->def->type);
+ cookie->type);
type = _type;
break;
}
@@ -274,30 +272,30 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
cookie->flags,
cookie->netfs_data);
- if (cookie->def->get_key &&
- config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = cookie->def->get_key(cookie->netfs_data,
- buf, 400);
+ if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = cookie->key_len;
- if (cookie->def->get_aux &&
- config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = cookie->def->get_aux(cookie->netfs_data,
- buf + keylen, 512 - keylen);
- fscache_unuse_cookie(obj);
+ if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = cookie->aux_len;
if (keylen > 0 || auxlen > 0) {
seq_puts(m, " ");
- for (p = buf; keylen > 0; keylen--)
+ p = keylen <= sizeof(cookie->inline_key) ?
+ cookie->inline_key : cookie->key;
+ for (; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
if (auxlen > 0) {
if (config & FSCACHE_OBJLIST_CONFIG_KEY)
seq_puts(m, ", ");
+ p = auxlen <= sizeof(cookie->inline_aux) ?
+ cookie->inline_aux : cookie->aux;
for (; auxlen > 0; auxlen--)
seq_printf(m, "%02x", *p++);
}
}
seq_puts(m, "\n");
+ fscache_unuse_cookie(obj);
} else {
seq_puts(m, "<no_netfs>\n");
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 7a182c87f378..1085ca12e25c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -138,10 +138,13 @@ static const struct fscache_transition fscache_osm_run_oob[] = {
{ 0, NULL }
};
-static int fscache_get_object(struct fscache_object *);
-static void fscache_put_object(struct fscache_object *);
+static int fscache_get_object(struct fscache_object *,
+ enum fscache_obj_ref_trace);
+static void fscache_put_object(struct fscache_object *,
+ enum fscache_obj_ref_trace);
static bool fscache_enqueue_dependents(struct fscache_object *, int);
static void fscache_dequeue_object(struct fscache_object *);
+static void fscache_update_aux_data(struct fscache_object *);
/*
* we need to notify the parent when an op completes that we had outstanding
@@ -170,6 +173,7 @@ static void fscache_object_sm_dispatcher(struct fscache_object *object)
const struct fscache_transition *t;
const struct fscache_state *state, *new_state;
unsigned long events, event_mask;
+ bool oob;
int event = -1;
ASSERT(object != NULL);
@@ -188,6 +192,7 @@ restart_masked:
if (events & object->oob_event_mask) {
_debug("{OBJ%x} oob %lx",
object->debug_id, events & object->oob_event_mask);
+ oob = true;
for (t = object->oob_table; t->events; t++) {
if (events & t->events) {
state = t->transit_to;
@@ -199,6 +204,7 @@ restart_masked:
}
}
}
+ oob = false;
/* Wait states are just transition tables */
if (!state->work) {
@@ -207,6 +213,8 @@ restart_masked:
if (events & t->events) {
new_state = t->transit_to;
event = fls(events & t->events) - 1;
+ trace_fscache_osm(object, state,
+ true, false, event);
clear_bit(event, &object->events);
_debug("{OBJ%x} ev %d: %s -> %s",
object->debug_id, event,
@@ -226,6 +234,7 @@ restart_masked:
execute_work_state:
_debug("{OBJ%x} exec %s", object->debug_id, state->name);
+ trace_fscache_osm(object, state, false, oob, event);
new_state = state->work(object, event);
event = -1;
if (new_state == NO_TRANSIT) {
@@ -279,7 +288,7 @@ static void fscache_object_work_func(struct work_struct *work)
start = jiffies;
fscache_object_sm_dispatcher(object);
fscache_hist(fscache_objs_histogram, start);
- fscache_put_object(object);
+ fscache_put_object(object, fscache_obj_put_work);
}
/**
@@ -397,7 +406,7 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje
fscache_stat(&fscache_n_cop_grab_object);
success = false;
if (fscache_object_is_live(parent) &&
- object->cache->ops->grab_object(object)) {
+ object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) {
list_add(&object->dep_link, &parent->dependents);
success = true;
}
@@ -703,6 +712,11 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
ASSERT(cookie != NULL);
ASSERT(!hlist_unhashed(&object->cookie_link));
+ if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) {
+ _debug("final update");
+ fscache_update_aux_data(object);
+ }
+
/* Make sure the cookie no longer points here and that the netfs isn't
* waiting for us.
*/
@@ -745,7 +759,7 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
}
/* this just shifts the object release to the work processor */
- fscache_put_object(object);
+ fscache_put_object(object, fscache_obj_put_drop_obj);
fscache_stat(&fscache_n_object_dead);
_leave("");
@@ -755,12 +769,13 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
/*
* get a ref on an object
*/
-static int fscache_get_object(struct fscache_object *object)
+static int fscache_get_object(struct fscache_object *object,
+ enum fscache_obj_ref_trace why)
{
int ret;
fscache_stat(&fscache_n_cop_grab_object);
- ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+ ret = object->cache->ops->grab_object(object, why) ? 0 : -EAGAIN;
fscache_stat_d(&fscache_n_cop_grab_object);
return ret;
}
@@ -768,10 +783,11 @@ static int fscache_get_object(struct fscache_object *object)
/*
* Discard a ref on an object
*/
-static void fscache_put_object(struct fscache_object *object)
+static void fscache_put_object(struct fscache_object *object,
+ enum fscache_obj_ref_trace why)
{
fscache_stat(&fscache_n_cop_put_object);
- object->cache->ops->put_object(object);
+ object->cache->ops->put_object(object, why);
fscache_stat_d(&fscache_n_cop_put_object);
}
@@ -786,7 +802,7 @@ void fscache_object_destroy(struct fscache_object *object)
fscache_objlist_remove(object);
/* We can get rid of the cookie now */
- fscache_cookie_put(object->cookie);
+ fscache_cookie_put(object->cookie, fscache_cookie_put_object);
object->cookie = NULL;
}
EXPORT_SYMBOL(fscache_object_destroy);
@@ -798,7 +814,7 @@ void fscache_enqueue_object(struct fscache_object *object)
{
_enter("{OBJ%x}", object->debug_id);
- if (fscache_get_object(object) >= 0) {
+ if (fscache_get_object(object, fscache_obj_get_queue) >= 0) {
wait_queue_head_t *cong_wq =
&get_cpu_var(fscache_object_cong_wait);
@@ -806,7 +822,7 @@ void fscache_enqueue_object(struct fscache_object *object)
if (fscache_object_congested())
wake_up(cong_wq);
} else
- fscache_put_object(object);
+ fscache_put_object(object, fscache_obj_put_queue);
put_cpu_var(fscache_object_cong_wait);
}
@@ -866,7 +882,7 @@ static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
list_del_init(&dep->dep_link);
fscache_raise_event(dep, event);
- fscache_put_object(dep);
+ fscache_put_object(dep, fscache_obj_put_enq_dep);
if (!list_empty(&object->dependents) && need_resched()) {
ret = false;
@@ -906,7 +922,8 @@ static void fscache_dequeue_object(struct fscache_object *object)
* and creation).
*/
enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
- const void *data, uint16_t datalen)
+ const void *data, uint16_t datalen,
+ loff_t object_size)
{
enum fscache_checkaux result;
@@ -916,7 +933,7 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
}
result = object->cookie->def->check_aux(object->cookie->netfs_data,
- data, datalen);
+ data, datalen, object_size);
switch (result) {
/* entry okay as is */
case FSCACHE_CHECKAUX_OKAY:
@@ -972,11 +989,12 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
if (!op)
goto nomem;
- fscache_operation_init(op, object->cache->ops->invalidate_object,
+ fscache_operation_init(cookie, op, object->cache->ops->invalidate_object,
NULL, NULL);
op->flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_EXCLUSIVE) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
+ trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate);
spin_lock(&cookie->lock);
if (fscache_submit_exclusive_op(object, op) < 0)
@@ -1026,6 +1044,17 @@ static const struct fscache_state *fscache_invalidate_object(struct fscache_obje
}
/*
+ * Update auxiliary data.
+ */
+static void fscache_update_aux_data(struct fscache_object *object)
+{
+ fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
+ object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
+}
+
+/*
* Asynchronously update an object.
*/
static const struct fscache_state *fscache_update_object(struct fscache_object *object,
@@ -1033,10 +1062,7 @@ static const struct fscache_state *fscache_update_object(struct fscache_object *
{
_enter("{OBJ%x},%d", object->debug_id, event);
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
+ fscache_update_aux_data(object);
_leave("");
return transit_to(WAIT_FOR_CMD);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index de67745e1cd7..e30c5975ea58 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -32,7 +32,8 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op)
* Do basic initialisation of an operation. The caller must still set flags,
* object and processor if needed.
*/
-void fscache_operation_init(struct fscache_operation *op,
+void fscache_operation_init(struct fscache_cookie *cookie,
+ struct fscache_operation *op,
fscache_operation_processor_t processor,
fscache_operation_cancel_t cancel,
fscache_operation_release_t release)
@@ -46,6 +47,7 @@ void fscache_operation_init(struct fscache_operation *op,
op->release = release;
INIT_LIST_HEAD(&op->pend_link);
fscache_stat(&fscache_n_op_initialised);
+ trace_fscache_op(cookie, op, fscache_op_init);
}
EXPORT_SYMBOL(fscache_operation_init);
@@ -59,6 +61,8 @@ EXPORT_SYMBOL(fscache_operation_init);
*/
void fscache_enqueue_operation(struct fscache_operation *op)
{
+ struct fscache_cookie *cookie = op->object->cookie;
+
_enter("{OBJ%x OP%x,%u}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
@@ -71,12 +75,14 @@ void fscache_enqueue_operation(struct fscache_operation *op)
fscache_stat(&fscache_n_op_enqueue);
switch (op->flags & FSCACHE_OP_TYPE) {
case FSCACHE_OP_ASYNC:
+ trace_fscache_op(cookie, op, fscache_op_enqueue_async);
_debug("queue async");
atomic_inc(&op->usage);
if (!queue_work(fscache_op_wq, &op->work))
fscache_put_operation(op);
break;
case FSCACHE_OP_MYTHREAD:
+ trace_fscache_op(cookie, op, fscache_op_enqueue_mythread);
_debug("queue for caller's attention");
break;
default:
@@ -101,6 +107,8 @@ static void fscache_run_op(struct fscache_object *object,
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
if (op->processor)
fscache_enqueue_operation(op);
+ else
+ trace_fscache_op(object->cookie, op, fscache_op_run);
fscache_stat(&fscache_n_op_run);
}
@@ -155,6 +163,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+ trace_fscache_op(object->cookie, op, fscache_op_submit_ex);
+
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -240,6 +250,8 @@ int fscache_submit_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},{%u}",
object->debug_id, op->debug_id, atomic_read(&op->usage));
+ trace_fscache_op(object->cookie, op, fscache_op_submit);
+
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -357,6 +369,8 @@ int fscache_cancel_op(struct fscache_operation *op,
_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+ trace_fscache_op(object->cookie, op, fscache_op_cancel);
+
ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -419,6 +433,8 @@ void fscache_cancel_all_ops(struct fscache_object *object)
fscache_stat(&fscache_n_op_cancelled);
list_del_init(&op->pend_link);
+ trace_fscache_op(object->cookie, op, fscache_op_cancel_all);
+
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
op->cancel(op);
op->state = FSCACHE_OP_ST_CANCELLED;
@@ -454,9 +470,11 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled)
spin_lock(&object->lock);
if (!cancelled) {
+ trace_fscache_op(object->cookie, op, fscache_op_completed);
op->state = FSCACHE_OP_ST_COMPLETE;
} else {
op->cancel(op);
+ trace_fscache_op(object->cookie, op, fscache_op_cancelled);
op->state = FSCACHE_OP_ST_CANCELLED;
}
@@ -488,6 +506,8 @@ void fscache_put_operation(struct fscache_operation *op)
if (!atomic_dec_and_test(&op->usage))
return;
+ trace_fscache_op(op->object ? op->object->cookie : NULL, op, fscache_op_put);
+
_debug("PUT OP");
ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
op->state != FSCACHE_OP_ST_COMPLETE,
@@ -563,6 +583,8 @@ void fscache_operation_gc(struct work_struct *work)
spin_unlock(&cache->op_gc_list_lock);
object = op->object;
+ trace_fscache_op(object->cookie, op, fscache_op_gc);
+
spin_lock(&object->lock);
_debug("GC DEFERRED REL OBJ%x OP%x",
@@ -601,6 +623,8 @@ void fscache_op_work_func(struct work_struct *work)
_enter("{OBJ%x OP%x,%d}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+ trace_fscache_op(op->object->cookie, op, fscache_op_work);
+
ASSERT(op->processor != NULL);
start = jiffies;
op->processor(op);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 961029e04027..111349f67d98 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -27,6 +27,7 @@ bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page
rcu_read_lock();
val = radix_tree_lookup(&cookie->stores, page->index);
rcu_read_unlock();
+ trace_fscache_check_page(cookie, page, val, 0);
return val != NULL;
}
@@ -39,6 +40,8 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
{
wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+ trace_fscache_page(cookie, page, fscache_page_write_wait);
+
wait_event(*wq, !__fscache_check_page_write(cookie, page));
}
EXPORT_SYMBOL(__fscache_wait_on_page_write);
@@ -69,6 +72,8 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
_enter("%p,%p,%x", cookie, page, gfp);
+ trace_fscache_page(cookie, page, fscache_page_maybe_release);
+
try_again:
rcu_read_lock();
val = radix_tree_lookup(&cookie->stores, page->index);
@@ -101,6 +106,7 @@ try_again:
}
xpage = radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
spin_unlock(&cookie->stores_lock);
if (xpage) {
@@ -112,6 +118,7 @@ try_again:
}
wake_up_bit(&cookie->flags, 0);
+ trace_fscache_wake_cookie(cookie);
if (xpage)
put_page(xpage);
__fscache_uncache_page(cookie, page);
@@ -144,7 +151,7 @@ static void fscache_end_page_write(struct fscache_object *object,
struct page *page)
{
struct fscache_cookie *cookie;
- struct page *xpage = NULL;
+ struct page *xpage = NULL, *val;
spin_lock(&object->lock);
cookie = object->cookie;
@@ -154,13 +161,24 @@ static void fscache_end_page_write(struct fscache_object *object,
spin_lock(&cookie->stores_lock);
radix_tree_tag_clear(&cookie->stores, page->index,
FSCACHE_COOKIE_STORING_TAG);
+ trace_fscache_page(cookie, page, fscache_page_radix_clear_store);
if (!radix_tree_tag_get(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG)) {
fscache_stat(&fscache_n_store_radix_deletes);
xpage = radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
+ trace_fscache_page(cookie, page, fscache_page_write_end);
+
+ val = radix_tree_lookup(&cookie->stores, page->index);
+ trace_fscache_check_page(cookie, page, val, 1);
+ } else {
+ trace_fscache_page(cookie, page, fscache_page_write_end_pend);
}
spin_unlock(&cookie->stores_lock);
wake_up_bit(&cookie->flags, 0);
+ trace_fscache_wake_cookie(cookie);
+ } else {
+ trace_fscache_page(cookie, page, fscache_page_write_end_noc);
}
spin_unlock(&object->lock);
if (xpage)
@@ -185,9 +203,11 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat_d(&fscache_n_cop_attr_changed);
if (ret < 0)
fscache_abort_object(object);
+ fscache_op_complete(op, ret < 0);
+ } else {
+ fscache_op_complete(op, true);
}
- fscache_op_complete(op, true);
_leave("");
}
@@ -213,7 +233,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
return -ENOMEM;
}
- fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL);
+ fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL);
+ trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed);
op->flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_EXCLUSIVE) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -297,7 +318,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
return NULL;
}
- fscache_operation_init(&op->op, NULL,
+ fscache_operation_init(cookie, &op->op, NULL,
fscache_do_cancel_retrieval,
fscache_release_retrieval_op);
op->op.flags = FSCACHE_OP_MYTHREAD |
@@ -368,6 +389,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
fscache_stat(stat_op_waits);
if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
TASK_INTERRUPTIBLE) != 0) {
+ trace_fscache_op(object->cookie, op, fscache_op_signal);
ret = fscache_cancel_op(op, false);
if (ret == 0)
return -ERESTARTSYS;
@@ -389,6 +411,7 @@ check_if_dead:
if (unlikely(fscache_object_is_dying(object) ||
fscache_cache_is_broken(object))) {
enum fscache_operation_state state = op->state;
+ trace_fscache_op(object->cookie, op, fscache_op_signal);
fscache_cancel_op(op, true);
if (stat_object_dead)
fscache_stat(stat_object_dead);
@@ -443,6 +466,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
return -ENOMEM;
}
atomic_set(&op->n_pages, 1);
+ trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one);
spin_lock(&cookie->lock);
@@ -571,6 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (!op)
return -ENOMEM;
atomic_set(&op->n_pages, *nr_pages);
+ trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi);
spin_lock(&cookie->lock);
@@ -682,6 +707,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
if (!op)
return -ENOMEM;
atomic_set(&op->n_pages, 1);
+ trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one);
spin_lock(&cookie->lock);
@@ -776,15 +802,17 @@ static void fscache_write_op(struct fscache_operation *_op)
_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
+again:
spin_lock(&object->lock);
cookie = object->cookie;
if (!fscache_object_is_active(object)) {
- /* If we get here, then the on-disk cache object likely longer
- * exists, so we should just cancel this write operation.
+ /* If we get here, then the on-disk cache object likely no
+ * longer exists, so we should just cancel this write
+ * operation.
*/
spin_unlock(&object->lock);
- fscache_op_complete(&op->op, false);
+ fscache_op_complete(&op->op, true);
_leave(" [inactive]");
return;
}
@@ -797,7 +825,7 @@ static void fscache_write_op(struct fscache_operation *_op)
* cancel this write operation.
*/
spin_unlock(&object->lock);
- fscache_op_complete(&op->op, false);
+ fscache_op_complete(&op->op, true);
_leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
_op->flags, _op->state, object->state->short_name,
object->flags);
@@ -809,30 +837,33 @@ static void fscache_write_op(struct fscache_operation *_op)
fscache_stat(&fscache_n_store_calls);
/* find a page to store */
+ results[0] = NULL;
page = NULL;
n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
FSCACHE_COOKIE_PENDING_TAG);
+ trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit);
if (n != 1)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index >= op->store_limit) {
- fscache_stat(&fscache_n_store_pages_over_limit);
- goto superseded;
- }
radix_tree_tag_set(&cookie->stores, page->index,
FSCACHE_COOKIE_STORING_TAG);
radix_tree_tag_clear(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG);
+ trace_fscache_page(cookie, page, fscache_page_radix_pend2store);
spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
+ if (page->index >= op->store_limit)
+ goto discard_page;
+
fscache_stat(&fscache_n_store_pages);
fscache_stat(&fscache_n_cop_write_page);
ret = object->cache->ops->write_page(op, page);
fscache_stat_d(&fscache_n_cop_write_page);
+ trace_fscache_wrote_page(cookie, page, &op->op, ret);
fscache_end_page_write(object, page);
if (ret < 0) {
fscache_abort_object(object);
@@ -844,6 +875,12 @@ static void fscache_write_op(struct fscache_operation *_op)
_leave("");
return;
+discard_page:
+ fscache_stat(&fscache_n_store_pages_over_limit);
+ trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS);
+ fscache_end_page_write(object, page);
+ goto again;
+
superseded:
/* this writer is going away and there aren't any more things to
* write */
@@ -851,7 +888,7 @@ superseded:
spin_unlock(&cookie->stores_lock);
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
spin_unlock(&object->lock);
- fscache_op_complete(&op->op, true);
+ fscache_op_complete(&op->op, false);
_leave("");
}
@@ -879,6 +916,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
for (i = n - 1; i >= 0; i--) {
page = results[i];
radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
+ trace_fscache_page(cookie, page, fscache_page_inval);
}
spin_unlock(&cookie->stores_lock);
@@ -888,6 +927,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
}
wake_up_bit(&cookie->flags, 0);
+ trace_fscache_wake_cookie(cookie);
_leave("");
}
@@ -923,6 +963,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
*/
int __fscache_write_page(struct fscache_cookie *cookie,
struct page *page,
+ loff_t object_size,
gfp_t gfp)
{
struct fscache_storage *op;
@@ -946,7 +987,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (!op)
goto nomem;
- fscache_operation_init(&op->op, fscache_write_op, NULL,
+ fscache_operation_init(cookie, &op->op, fscache_write_op, NULL,
fscache_release_write_op);
op->op.flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_WAITING) |
@@ -956,6 +997,8 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (ret < 0)
goto nomem_free;
+ trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one);
+
ret = -ENOBUFS;
spin_lock(&cookie->lock);
@@ -967,9 +1010,15 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
goto nobufs;
+ trace_fscache_page(cookie, page, fscache_page_write);
+
/* add the page to the pending-storage radix tree on the backing
* object */
spin_lock(&object->lock);
+
+ if (object->store_limit_l != object_size)
+ fscache_set_store_limit(object, object_size);
+
spin_lock(&cookie->stores_lock);
_debug("store limit %llx", (unsigned long long) object->store_limit);
@@ -982,8 +1031,10 @@ int __fscache_write_page(struct fscache_cookie *cookie,
goto nobufs_unlock_obj;
}
+ trace_fscache_page(cookie, page, fscache_page_radix_insert);
radix_tree_tag_set(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG);
+ trace_fscache_page(cookie, page, fscache_page_radix_set_pend);
get_page(page);
/* we only want one writer at a time, but we do need to queue new
@@ -1026,6 +1077,7 @@ already_pending:
submit_failed:
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
spin_unlock(&cookie->stores_lock);
wake_cookie = __fscache_unuse_cookie(cookie);
put_page(page);
@@ -1072,6 +1124,8 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
if (!PageFsCache(page))
goto done;
+ trace_fscache_page(cookie, page, fscache_page_uncache);
+
/* get the object */
spin_lock(&cookie->lock);
@@ -1120,6 +1174,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
atomic_inc(&fscache_n_marks);
#endif
+ trace_fscache_page(cookie, page, fscache_page_cached);
+
_debug("- mark %p{%lx}", page, page->index);
if (TestSetPageFsCache(page)) {
static bool once_only;
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 7ac6e839b065..fcc8c2f2690e 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -21,7 +21,6 @@
atomic_t fscache_n_op_pend;
atomic_t fscache_n_op_run;
atomic_t fscache_n_op_enqueue;
-atomic_t fscache_n_op_requeue;
atomic_t fscache_n_op_deferred_release;
atomic_t fscache_n_op_initialised;
atomic_t fscache_n_op_release;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 624f18bbfd2b..ef309958e060 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1080,6 +1080,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_time_gran = 1;
sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
file = fget(d.fd);
err = -EINVAL;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 2f725b4a386b..f58716567972 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -940,13 +940,13 @@ failed:
}
/**
- * gfs2_set_page_dirty - Page dirtying function
+ * jdata_set_page_dirty - Page dirtying function
* @page: The page to dirty
*
* Returns: 1 if it dirtyed the page, or 0 otherwise
*/
-static int gfs2_set_page_dirty(struct page *page)
+static int jdata_set_page_dirty(struct page *page)
{
SetPageChecked(page);
return __set_page_dirty_buffers(page);
@@ -1214,7 +1214,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
.readpages = gfs2_readpages,
.write_begin = gfs2_write_begin,
.write_end = gfs2_write_end,
- .set_page_dirty = gfs2_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_buffers,
.bmap = gfs2_bmap,
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
@@ -1231,7 +1231,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.readpages = gfs2_readpages,
.write_begin = gfs2_write_begin,
.write_end = gfs2_write_end,
- .set_page_dirty = gfs2_set_page_dirty,
+ .set_page_dirty = jdata_set_page_dirty,
.bmap = gfs2_bmap,
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 86d6a4435c87..685c305cbeb6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -491,14 +491,12 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct super_block *sb = sdp->sd_vfs;
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = mp->mp_fheight - 1;
- int ret;
enum alloc_state state;
__be64 *ptr;
__be64 zero_bn = 0;
@@ -607,15 +605,6 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
iomap->flags |= IOMAP_F_NEW;
while (n-- > 0)
*ptr++ = cpu_to_be64(bn++);
- if (flags & IOMAP_ZERO) {
- ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
- dblks, GFP_NOFS);
- if (ret) {
- fs_err(sdp,
- "Failed to zero data buffers\n");
- flags &= ~IOMAP_ZERO;
- }
- }
break;
}
} while (iomap->addr == IOMAP_NULL_ADDR);
@@ -807,23 +796,27 @@ do_alloc:
iomap->length = hole_size(inode, lblock, &mp);
else
iomap->length = size - pos;
- } else {
- if (height <= ip->i_height)
- iomap->length = hole_size(inode, lblock, &mp);
}
goto out_release;
}
/**
- * gfs2_block_map - Map a block from an inode to a disk block
+ * gfs2_block_map - Map one or more blocks of an inode to a disk block
* @inode: The inode
* @lblock: The logical block number
* @bh_map: The bh to be mapped
* @create: True if its ok to alloc blocks to satify the request
*
- * Sets buffer_mapped() if successful, sets buffer_boundary() if a
- * read of metadata will be required before the next block can be
- * mapped. Sets buffer_new() if new blocks were allocated.
+ * The size of the requested mapping is defined in bh_map->b_size.
+ *
+ * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
+ * when @lblock is not mapped. Sets buffer_mapped(bh_map) and
+ * bh_map->b_size to indicate the size of the mapping when @lblock and
+ * successive blocks are mapped, up to the requested size.
+ *
+ * Sets buffer_boundary() if a read of metadata will be required
+ * before the next block can be mapped. Sets buffer_new() if new
+ * blocks were allocated.
*
* Returns: errno
*/
@@ -842,8 +835,6 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
if (create)
flags |= IOMAP_WRITE;
- if (buffer_zeronew(bh_map))
- flags |= IOMAP_ZERO;
ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
bh_map->b_size, flags, &iomap);
if (ret) {
@@ -1347,6 +1338,7 @@ static inline bool walk_done(struct gfs2_sbd *sdp,
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 maxsize = sdp->sd_heightsize[ip->i_height];
struct metapath mp = {};
struct buffer_head *dibh, *bh;
struct gfs2_holder rd_gh;
@@ -1362,6 +1354,14 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
u64 prev_bnr = 0;
__be64 *start, *end;
+ if (offset >= maxsize) {
+ /*
+ * The starting point lies beyond the allocated meta-data;
+ * there are no blocks to deallocate.
+ */
+ return 0;
+ }
+
/*
* The start position of the hole is defined by lblock, start_list, and
* start_aligned. The end position of the hole is defined by lend,
@@ -1375,7 +1375,6 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
*/
if (length) {
- u64 maxsize = sdp->sd_heightsize[ip->i_height];
u64 end_offset = offset + length;
u64 lend;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 7c21aea0266b..d9fb0ad6cc30 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1940,7 +1940,6 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
{
struct buffer_head *bh;
struct gfs2_dirent *dent;
- int error;
dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
if (!dent) {
@@ -1953,18 +1952,10 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
gfs2_trans_add_meta(dip->i_gl, bh);
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(new_type);
-
- if (dip->i_diskflags & GFS2_DIF_EXHASH) {
- brelse(bh);
- error = gfs2_meta_inode_buffer(dip, &bh);
- if (error)
- return error;
- gfs2_trans_add_meta(dip->i_gl, bh);
- }
+ brelse(bh);
dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode);
- gfs2_dinode_out(dip, bh->b_data);
- brelse(bh);
+ mark_inode_dirty_sync(&dip->i_inode);
return 0;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4f88e201b3f0..4b71f021a9e2 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -729,11 +729,12 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
int mode)
{
+ struct super_block *sb = inode->i_sb;
struct gfs2_inode *ip = GFS2_I(inode);
+ loff_t end = offset + len;
struct buffer_head *dibh;
+ struct iomap iomap;
int error;
- unsigned int nr_blks;
- sector_t lblock = offset >> inode->i_blkbits;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(error))
@@ -747,21 +748,19 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
- while (len) {
- struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
- bh_map.b_size = len;
- set_buffer_zeronew(&bh_map);
-
- error = gfs2_block_map(inode, lblock, &bh_map, 1);
- if (unlikely(error))
+ while (offset < end) {
+ error = gfs2_iomap_begin(inode, offset, end - offset,
+ IOMAP_WRITE, &iomap);
+ if (error)
goto out;
- len -= bh_map.b_size;
- nr_blks = bh_map.b_size >> inode->i_blkbits;
- lblock += nr_blks;
- if (!buffer_new(&bh_map))
+ offset = iomap.offset + iomap.length;
+ if (iomap.type != IOMAP_HOLE)
continue;
- if (unlikely(!buffer_zeronew(&bh_map))) {
- error = -EIO;
+ error = sb_issue_zeroout(sb, iomap.addr >> inode->i_blkbits,
+ iomap.length >> inode->i_blkbits,
+ GFP_NOFS);
+ if (error) {
+ fs_err(GFS2_SB(inode), "Failed to zero data buffers\n");
goto out;
}
}
@@ -809,7 +808,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
- loff_t bytes, max_bytes, max_blks = UINT_MAX;
+ loff_t bytes, max_bytes, max_blks;
int error;
const loff_t pos = offset;
const loff_t count = len;
@@ -861,7 +860,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
return error;
/* ap.allowed tells us how many blocks quota will allow
* us to write. Check if this reduces max_blks */
- if (ap.allowed && ap.allowed < max_blks)
+ max_blks = UINT_MAX;
+ if (ap.allowed)
max_blks = ap.allowed;
error = gfs2_inplace_reserve(ip, &ap);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e0557b8a590a..1b6b1e3f5caf 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -130,15 +130,12 @@ static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
enum gfs2_state_bits {
BH_Pinned = BH_PrivateStart,
BH_Escaped = BH_PrivateStart + 1,
- BH_Zeronew = BH_PrivateStart + 2,
};
BUFFER_FNS(Pinned, pinned)
TAS_BUFFER_FNS(Pinned, pinned)
BUFFER_FNS(Escaped, escaped)
TAS_BUFFER_FNS(Escaped, escaped)
-BUFFER_FNS(Zeronew, zeronew)
-TAS_BUFFER_FNS(Zeronew, zeronew)
struct gfs2_bufdata {
struct buffer_head *bd_bh;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 59e0560180ec..8700eb815638 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1326,19 +1326,11 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip,
int dir_rename)
{
- int error;
- struct buffer_head *dibh;
-
if (dir_rename)
return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+ mark_inode_dirty_sync(&ip->i_inode);
return 0;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index cf6b46247df4..0248835625f1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
*
*/
-void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
+static void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
{
bd->bd_tr = NULL;
list_del_init(&bd->bd_ail_st_list);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 93b52ac1ca1f..1862e310a067 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -70,7 +70,6 @@ extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
u32 type);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 5e47c935a515..836f29480be6 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -45,6 +45,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
int ret;
+
+ ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index b6b258998bcd..d8b622c375ab 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -15,6 +15,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/crc32c.h>
+#include <linux/ktime.h>
#include "gfs2.h"
#include "incore.h"
@@ -409,12 +410,13 @@ void gfs2_recover_func(struct work_struct *work)
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_log_header_host head;
struct gfs2_holder j_gh, ji_gh, thaw_gh;
- unsigned long t;
+ ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep;
int ro = 0;
unsigned int pass;
int error;
int jlocked = 0;
+ t_start = ktime_get();
if (sdp->sd_args.ar_spectator ||
(jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
@@ -446,6 +448,7 @@ void gfs2_recover_func(struct work_struct *work)
fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
}
+ t_jlck = ktime_get();
fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
error = gfs2_jdesc_check(jd);
@@ -455,13 +458,12 @@ void gfs2_recover_func(struct work_struct *work)
error = gfs2_find_jhead(jd, &head);
if (error)
goto fail_gunlock_ji;
+ t_jhd = ktime_get();
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
jd->jd_jid);
- t = jiffies;
-
/* Acquire a shared hold on the freeze lock */
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
@@ -495,6 +497,7 @@ void gfs2_recover_func(struct work_struct *work)
goto fail_gunlock_thaw;
}
+ t_tlck = ktime_get();
fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
for (pass = 0; pass < 2; pass++) {
@@ -509,9 +512,14 @@ void gfs2_recover_func(struct work_struct *work)
clean_journal(jd, &head);
gfs2_glock_dq_uninit(&thaw_gh);
- t = DIV_ROUND_UP(jiffies - t, HZ);
- fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
- jd->jd_jid, t);
+ t_rep = ktime_get();
+ fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, "
+ "jhead:%lldms, tlck:%lldms, replay:%lldms]\n",
+ jd->jd_jid, ktime_ms_delta(t_rep, t_start),
+ ktime_ms_delta(t_jlck, t_start),
+ ktime_ms_delta(t_jhd, t_jlck),
+ ktime_ms_delta(t_tlck, t_jhd),
+ ktime_ms_delta(t_rep, t_tlck));
}
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 620be0521866..cf5c7f3080d2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -800,7 +800,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
- if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
+ if (!(flags & I_DIRTY_INODE))
return;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index b9318b49ff8f..cb10b95efe0f 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -515,6 +515,7 @@ TRACE_EVENT(gfs2_iomap_end,
__field( u64, inum )
__field( loff_t, offset )
__field( ssize_t, length )
+ __field( sector_t, pblock )
__field( u16, flags )
__field( u16, type )
__field( int, ret )
@@ -525,16 +526,20 @@ TRACE_EVENT(gfs2_iomap_end,
__entry->inum = ip->i_no_addr;
__entry->offset = iomap->offset;
__entry->length = iomap->length;
+ __entry->pblock = iomap->addr == IOMAP_NULL_ADDR ? 0 :
+ (iomap->addr >> ip->i_inode.i_blkbits);
__entry->flags = iomap->flags;
__entry->type = iomap->type;
__entry->ret = ret;
),
- TP_printk("%u,%u bmap %llu iomap end %llu/%lu ty:%d flags:%08x rc:%d",
+ TP_printk("%u,%u bmap %llu iomap end %llu/%lu to %llu ty:%d flags:%08x rc:%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->inum,
(unsigned long long)__entry->offset,
- (unsigned long)__entry->length, (u16)__entry->type,
+ (unsigned long)__entry->length,
+ (long long)__entry->pblock,
+ (u16)__entry->type,
(u16)__entry->flags, __entry->ret)
);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 05de20954659..f2bce1e0f6fb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -308,7 +308,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
}
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(sdp);
@@ -768,7 +768,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out_end_trans;
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
out_end_trans:
gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -896,7 +896,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
ea_set_remove_stuffed(ip, es->es_el);
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
return error;
@@ -1114,7 +1114,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
}
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index ffaec2e7526c..cb8374af08a6 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -84,7 +84,7 @@ extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd);
extern int make_symlink(const char *from, const char *to);
extern int unlink_file(const char *file);
extern int do_mkdir(const char *file, int mode);
-extern int do_rmdir(const char *file);
+extern int hostfs_do_rmdir(const char *file);
extern int do_mknod(const char *file, int mode, unsigned int major,
unsigned int minor);
extern int link_file(const char *from, const char *to);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index c148e7f4f451..3cd85eb5bbb1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -706,7 +706,7 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
if ((file = dentry_name(dentry)) == NULL)
return -ENOMEM;
- err = do_rmdir(file);
+ err = hostfs_do_rmdir(file);
__putname(file);
return err;
}
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 9c1e0f019880..5ecc4706172b 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -304,7 +304,7 @@ int do_mkdir(const char *file, int mode)
return 0;
}
-int do_rmdir(const char *file)
+int hostfs_do_rmdir(const char *file)
{
int err;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8fe1b0aa2896..d508c7844681 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
pagevec_reinit(pvec);
}
+/*
+ * Mask used when checking the page offset value passed in via system
+ * calls. This value will be converted to a loff_t which is signed.
+ * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
+ * value. The extra bit (- 1 in the shift value) is to take the sign
+ * bit into account.
+ */
+#define PGOFF_LOFFT_MAX \
+ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
+
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
@@ -127,12 +137,17 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &hugetlb_vm_ops;
/*
- * Offset passed to mmap (before page shift) could have been
- * negative when represented as a (l)off_t.
+ * page based offset in vm_pgoff could be sufficiently large to
+ * overflow a loff_t when converted to byte offset. This can
+ * only happen on architectures where sizeof(loff_t) ==
+ * sizeof(unsigned long). So, only check in those instances.
*/
- if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
- return -EINVAL;
+ if (sizeof(unsigned long) == sizeof(loff_t)) {
+ if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+ return -EINVAL;
+ }
+ /* must be huge page aligned */
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
diff --git a/fs/inode.c b/fs/inode.c
index ef362364d396..b153aeaa61ea 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -346,9 +346,8 @@ void inc_nlink(struct inode *inode)
}
EXPORT_SYMBOL(inc_nlink);
-void address_space_init_once(struct address_space *mapping)
+static void __address_space_init_once(struct address_space *mapping)
{
- memset(mapping, 0, sizeof(*mapping));
INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&mapping->tree_lock);
init_rwsem(&mapping->i_mmap_rwsem);
@@ -356,6 +355,12 @@ void address_space_init_once(struct address_space *mapping)
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT_CACHED;
}
+
+void address_space_init_once(struct address_space *mapping)
+{
+ memset(mapping, 0, sizeof(*mapping));
+ __address_space_init_once(mapping);
+}
EXPORT_SYMBOL(address_space_init_once);
/*
@@ -371,7 +376,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
- address_space_init_once(&inode->i_data);
+ __address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);
@@ -1533,7 +1538,6 @@ retry:
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
atomic_inc(&inode->i_count);
- inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
trace_writeback_lazytime_iput(inode);
mark_inode_dirty_sync(inode);
diff --git a/fs/internal.h b/fs/internal.h
index df262f41a0ef..e08972db0303 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,7 +55,15 @@ extern void __init chrdev_init(void);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
+long do_mknodat(int dfd, const char __user *filename, umode_t mode,
+ unsigned int dev);
+long do_mkdirat(int dfd, const char __user *pathname, umode_t mode);
+long do_rmdir(int dfd, const char __user *pathname);
long do_unlinkat(int dfd, struct filename *name);
+long do_symlinkat(const char __user *oldname, int newdfd,
+ const char __user *newname);
+int do_linkat(int olddfd, const char __user *oldname, int newdfd,
+ const char __user *newname, int flags);
/*
* namespace.c
@@ -111,7 +119,12 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
-extern int open_check_o_direct(struct file *f);
+long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+long do_faccessat(int dfd, const char __user *filename, int mode);
+int do_fchmodat(int dfd, const char __user *filename, umode_t mode);
+int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
+ int flag);
+
extern int vfs_open(const struct path *, struct file *, const struct cred *);
extern struct file *filp_clone_open(struct file *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5ace7efb0d04..4823431d1c9d 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -689,7 +689,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
return error;
}
-SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
{
int error;
struct fd f = fdget(fd);
@@ -702,3 +702,8 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
fdput(f);
return error;
}
+
+SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+{
+ return ksys_ioctl(fd, cmd, arg);
+}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3fbf48ec2188..dfb057900e79 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -974,7 +974,7 @@ out:
}
/*
- * This is a variaon of __jbd2_update_log_tail which checks for validity of
+ * This is a variation of __jbd2_update_log_tail which checks for validity of
* provided log tail and locks j_checkpoint_mutex. So it is safe against races
* with other threads updating log tail.
*/
@@ -1417,6 +1417,9 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
journal_superblock_t *sb = journal->j_superblock;
int ret;
+ if (is_journal_aborted(journal))
+ return -EIO;
+
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
@@ -1483,12 +1486,15 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
void jbd2_journal_update_sb_errno(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
+ int errcode;
read_lock(&journal->j_state_lock);
- jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
- journal->j_errno);
- sb->s_errno = cpu_to_be32(journal->j_errno);
+ errcode = journal->j_errno;
read_unlock(&journal->j_state_lock);
+ if (errcode == -ESHUTDOWN)
+ errcode = 0;
+ jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+ sb->s_errno = cpu_to_be32(errcode);
jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
}
@@ -2105,12 +2111,22 @@ void __jbd2_journal_abort_hard(journal_t *journal)
* but don't do any other IO. */
static void __journal_abort_soft (journal_t *journal, int errno)
{
- if (journal->j_flags & JBD2_ABORT)
- return;
+ int old_errno;
- if (!journal->j_errno)
+ write_lock(&journal->j_state_lock);
+ old_errno = journal->j_errno;
+ if (!journal->j_errno || errno == -ESHUTDOWN)
journal->j_errno = errno;
+ if (journal->j_flags & JBD2_ABORT) {
+ write_unlock(&journal->j_state_lock);
+ if (!old_errno && old_errno != -ESHUTDOWN &&
+ errno == -ESHUTDOWN)
+ jbd2_journal_update_sb_errno(journal);
+ return;
+ }
+ write_unlock(&journal->j_state_lock);
+
__jbd2_journal_abort_hard(journal);
if (errno) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index f99910b69c78..a4967b27ffb6 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -600,8 +600,8 @@ static int do_one_pass(journal_t *journal,
success = -EFSBADCRC;
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
- "block %llu in log\n",
- blocknr);
+ "data block %llu in "
+ "log\n", blocknr);
block_error = 1;
goto skip_write;
}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 4a6cf289be24..83b8f06b4a64 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -21,14 +21,6 @@
#include <linux/pagemap.h>
#include "nodelist.h"
-struct erase_priv_struct {
- struct jffs2_eraseblock *jeb;
- struct jffs2_sb_info *c;
-};
-
-#ifndef __ECOS
-static void jffs2_erase_callback(struct erase_info *);
-#endif
static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
@@ -51,7 +43,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n",
__func__,
jeb->offset, jeb->offset, jeb->offset + c->sector_size);
- instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL);
+ instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL);
if (!instr) {
pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
mutex_lock(&c->erase_free_sem);
@@ -67,18 +59,15 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
memset(instr, 0, sizeof(*instr));
- instr->mtd = c->mtd;
instr->addr = jeb->offset;
instr->len = c->sector_size;
- instr->callback = jffs2_erase_callback;
- instr->priv = (unsigned long)(&instr[1]);
-
- ((struct erase_priv_struct *)instr->priv)->jeb = jeb;
- ((struct erase_priv_struct *)instr->priv)->c = c;
ret = mtd_erase(c->mtd, instr);
- if (!ret)
+ if (!ret) {
+ jffs2_erase_succeeded(c, jeb);
+ kfree(instr);
return;
+ }
bad_offset = instr->fail_addr;
kfree(instr);
@@ -214,22 +203,6 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
wake_up(&c->erase_wait);
}
-#ifndef __ECOS
-static void jffs2_erase_callback(struct erase_info *instr)
-{
- struct erase_priv_struct *priv = (void *)instr->priv;
-
- if(instr->state != MTD_ERASE_DONE) {
- pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
- (unsigned long long)instr->addr, instr->state);
- jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
- } else {
- jffs2_erase_succeeded(priv->c, priv->jeb);
- }
- kfree(instr);
-}
-#endif /* !__ECOS */
-
/* Hmmm. Maybe we should accept the extra space it takes and make
this a standard doubly-linked list? */
static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 9c36d614bf89..346ed161756d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -57,8 +57,8 @@ static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
-atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
-DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
+static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
unsigned int lockd_net_id;
diff --git a/fs/locks.c b/fs/locks.c
index d6ff4beb70ce..62bbe8b31f26 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -559,7 +559,7 @@ static const struct lock_manager_operations lease_manager_ops = {
* Initialize a lease, use the default lock manager operations
*/
static int lease_init(struct file *filp, long type, struct file_lock *fl)
- {
+{
if (assign_type(fl, type) != 0)
return -EINVAL;
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index f2a0cfcef11d..bcd53a79156f 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -18,7 +18,7 @@ config MINIX_FS
config MINIX_FS_NATIVE_ENDIAN
def_bool MINIX_FS
- depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
+ depends on MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
def_bool MINIX_FS
diff --git a/fs/namei.c b/fs/namei.c
index 921ae32dbc80..a66ed5a1622a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,6 +39,7 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
+#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -130,6 +131,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
+ BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
@@ -559,9 +561,10 @@ static int __nd_alloc_stack(struct nameidata *nd)
static bool path_connected(const struct path *path)
{
struct vfsmount *mnt = path->mnt;
+ struct super_block *sb = mnt->mnt_sb;
- /* Only bind mounts can have disconnected paths */
- if (mnt->mnt_root == mnt->mnt_sb->s_root)
+ /* Bind mounts and multi-root filesystems can have disconnected paths */
+ if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
return true;
return is_subdir(path->dentry, mnt->mnt_root);
@@ -926,7 +929,8 @@ static inline int may_follow_link(struct nameidata *nd)
if (nd->flags & LOOKUP_RCU)
return -ECHILD;
- audit_log_link_denied("follow_link", &nd->stack[0].link);
+ audit_inode(nd->name, nd->stack[0].link.dentry, 0);
+ audit_log_link_denied("follow_link");
return -EACCES;
}
@@ -992,7 +996,7 @@ static int may_linkat(struct path *link)
if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
return 0;
- audit_log_link_denied("linkat", link);
+ audit_log_link_denied("linkat");
return -EPERM;
}
@@ -1473,43 +1477,36 @@ static struct dentry *lookup_dcache(const struct qstr *name,
}
/*
- * Call i_op->lookup on the dentry. The dentry must be negative and
- * unhashed.
- *
- * dir->d_inode->i_mutex must be held
+ * Parent directory has inode locked exclusive. This is the one
+ * and only case when ->lookup() gets called on non in-lookup
+ * dentries - as a matter of fact, this only gets called
+ * when directory is guaranteed to have no in-lookup children
+ * at all.
*/
-static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct dentry *old;
-
- /* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(dir))) {
- dput(dentry);
- return ERR_PTR(-ENOENT);
- }
-
- old = dir->i_op->lookup(dir, dentry, flags);
- if (unlikely(old)) {
- dput(dentry);
- dentry = old;
- }
- return dentry;
-}
-
static struct dentry *__lookup_hash(const struct qstr *name,
struct dentry *base, unsigned int flags)
{
struct dentry *dentry = lookup_dcache(name, base, flags);
+ struct dentry *old;
+ struct inode *dir = base->d_inode;
if (dentry)
return dentry;
+ /* Don't create child dentry for a dead directory. */
+ if (unlikely(IS_DEADDIR(dir)))
+ return ERR_PTR(-ENOENT);
+
dentry = d_alloc(base, name);
if (unlikely(!dentry))
return ERR_PTR(-ENOMEM);
- return lookup_real(base->d_inode, dentry, flags);
+ old = dir->i_op->lookup(dir, dentry, flags);
+ if (unlikely(old)) {
+ dput(dentry);
+ dentry = old;
+ }
+ return dentry;
}
static int lookup_fast(struct nameidata *nd,
@@ -3380,9 +3377,7 @@ finish_open_created:
goto out;
*opened |= FILE_OPENED;
opened:
- error = open_check_o_direct(file);
- if (!error)
- error = ima_file_check(file, op->acc_mode, *opened);
+ error = ima_file_check(file, op->acc_mode, *opened);
if (!error && will_truncate)
error = handle_truncate(file);
out:
@@ -3462,9 +3457,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
error = finish_open(file, child, NULL, opened);
if (error)
goto out2;
- error = open_check_o_direct(file);
- if (error)
- fput(file);
out2:
mnt_drop_write(path.mnt);
out:
@@ -3728,8 +3720,8 @@ static int may_mknod(umode_t mode)
}
}
-SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
- unsigned, dev)
+long do_mknodat(int dfd, const char __user *filename, umode_t mode,
+ unsigned int dev)
{
struct dentry *dentry;
struct path path;
@@ -3772,9 +3764,15 @@ out:
return error;
}
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
+ unsigned int, dev)
+{
+ return do_mknodat(dfd, filename, mode, dev);
+}
+
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
- return sys_mknodat(AT_FDCWD, filename, mode, dev);
+ return do_mknodat(AT_FDCWD, filename, mode, dev);
}
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -3803,7 +3801,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
EXPORT_SYMBOL(vfs_mkdir);
-SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
+long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
struct dentry *dentry;
struct path path;
@@ -3828,9 +3826,14 @@ retry:
return error;
}
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
+{
+ return do_mkdirat(dfd, pathname, mode);
+}
+
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
- return sys_mkdirat(AT_FDCWD, pathname, mode);
+ return do_mkdirat(AT_FDCWD, pathname, mode);
}
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -3872,7 +3875,7 @@ out:
}
EXPORT_SYMBOL(vfs_rmdir);
-static long do_rmdir(int dfd, const char __user *pathname)
+long do_rmdir(int dfd, const char __user *pathname)
{
int error = 0;
struct filename *name;
@@ -4108,8 +4111,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
}
EXPORT_SYMBOL(vfs_symlink);
-SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
- int, newdfd, const char __user *, newname)
+long do_symlinkat(const char __user *oldname, int newdfd,
+ const char __user *newname)
{
int error;
struct filename *from;
@@ -4139,9 +4142,15 @@ out_putname:
return error;
}
+SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
+ int, newdfd, const char __user *, newname)
+{
+ return do_symlinkat(oldname, newdfd, newname);
+}
+
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
- return sys_symlinkat(oldname, AT_FDCWD, newname);
+ return do_symlinkat(oldname, AT_FDCWD, newname);
}
/**
@@ -4233,8 +4242,8 @@ EXPORT_SYMBOL(vfs_link);
* with linux 2.0, and to avoid hard-linking to directories
* and other special files. --ADM
*/
-SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
- int, newdfd, const char __user *, newname, int, flags)
+int do_linkat(int olddfd, const char __user *oldname, int newdfd,
+ const char __user *newname, int flags)
{
struct dentry *new_dentry;
struct path old_path, new_path;
@@ -4298,9 +4307,15 @@ out:
return error;
}
+SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname, int, flags)
+{
+ return do_linkat(olddfd, oldname, newdfd, newname, flags);
+}
+
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
- return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
/**
@@ -4478,8 +4493,8 @@ out:
}
EXPORT_SYMBOL(vfs_rename);
-SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
- int, newdfd, const char __user *, newname, unsigned int, flags)
+static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
+ const char __user *newname, unsigned int flags)
{
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
@@ -4621,15 +4636,21 @@ exit:
return error;
}
+SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname, unsigned int, flags)
+{
+ return do_renameat2(olddfd, oldname, newdfd, newname, flags);
+}
+
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
- return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
+ return do_renameat2(olddfd, oldname, newdfd, newname, 0);
}
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
- return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
diff --git a/fs/namespace.c b/fs/namespace.c
index 9d1374ab6e06..e398f32d7541 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1680,7 +1680,7 @@ static inline bool may_mandlock(void)
* unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
*/
-SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
+int ksys_umount(char __user *name, int flags)
{
struct path path;
struct mount *mnt;
@@ -1720,6 +1720,11 @@ out:
return retval;
}
+SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
+{
+ return ksys_umount(name, flags);
+}
+
#ifdef __ARCH_WANT_SYS_OLDUMOUNT
/*
@@ -1727,7 +1732,7 @@ out:
*/
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
- return sys_umount(name, 0);
+ return ksys_umount(name, 0);
}
#endif
@@ -3032,8 +3037,8 @@ struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
}
EXPORT_SYMBOL(mount_subtree);
-SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
- char __user *, type, unsigned long, flags, void __user *, data)
+int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
+ unsigned long flags, void __user *data)
{
int ret;
char *kernel_type;
@@ -3066,6 +3071,12 @@ out_type:
return ret;
}
+SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
+ char __user *, type, unsigned long, flags, void __user *, data)
+{
+ return ksys_mount(dev_name, dir_name, type, flags, data);
+}
+
/*
* Return true if path is reachable from root
*
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2435af56b87e..a50d7813e3ea 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -572,7 +572,7 @@ out:
}
static bool
-validate_bitmap_values(unsigned long mask)
+validate_bitmap_values(unsigned int mask)
{
return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
}
@@ -596,17 +596,15 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
goto out;
status = cpu_to_be32(NFS4_OK);
- if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
- &args->craa_type_mask))
+ if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG))
flags = FMODE_READ;
- if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
- &args->craa_type_mask))
+ if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG))
flags |= FMODE_WRITE;
- if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
- &args->craa_type_mask))
- pnfs_recall_all_layouts(cps->clp);
if (flags)
nfs_expire_unused_delegation_types(cps->clp, flags);
+
+ if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
+ pnfs_recall_all_layouts(cps->clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8c10b0562e75..621c517b325c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -86,10 +86,10 @@ struct nfs_direct_req {
struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
int mirror_count;
+ loff_t io_start; /* Start offset for I/O */
ssize_t count, /* bytes actually processed */
max_count, /* max expected count */
bytes_left, /* bytes left to be sent */
- io_start, /* start of IO */
error; /* any reported error */
struct completion completion; /* wait for i/o completion */
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 0ee4b93d36ea..1c5d8d31fc0a 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -50,59 +50,6 @@ void nfs_fscache_unregister(void)
}
/*
- * Layout of the key for an NFS server cache object.
- */
-struct nfs_server_key {
- uint16_t nfsversion; /* NFS protocol version */
- uint16_t family; /* address family */
- uint16_t port; /* IP port */
- union {
- struct in_addr ipv4_addr; /* IPv4 address */
- struct in6_addr ipv6_addr; /* IPv6 address */
- } addr[0];
-};
-
-/*
- * Generate a key to describe a server in the main NFS index
- * - We return the length of the key, or 0 if we can't generate one
- */
-static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct nfs_client *clp = cookie_netfs_data;
- const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
- const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
- struct nfs_server_key *key = buffer;
- uint16_t len = sizeof(struct nfs_server_key);
-
- memset(key, 0, len);
- key->nfsversion = clp->rpc_ops->version;
- key->family = clp->cl_addr.ss_family;
-
- switch (clp->cl_addr.ss_family) {
- case AF_INET:
- key->port = sin->sin_port;
- key->addr[0].ipv4_addr = sin->sin_addr;
- len += sizeof(key->addr[0].ipv4_addr);
- break;
-
- case AF_INET6:
- key->port = sin6->sin6_port;
- key->addr[0].ipv6_addr = sin6->sin6_addr;
- len += sizeof(key->addr[0].ipv6_addr);
- break;
-
- default:
- printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
- clp->cl_addr.ss_family);
- len = 0;
- break;
- }
-
- return len;
-}
-
-/*
* Define the server object for FS-Cache. This is used to describe a server
* object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
* server address parameters.
@@ -110,33 +57,9 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
const struct fscache_cookie_def nfs_fscache_server_index_def = {
.name = "NFS.server",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = nfs_server_get_key,
};
/*
- * Generate a key to describe a superblock key in the main NFS index
- */
-static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct nfs_fscache_key *key;
- const struct nfs_server *nfss = cookie_netfs_data;
- uint16_t len;
-
- key = nfss->fscache_key;
- len = sizeof(key->key) + key->key.uniq_len;
- if (len > bufmax) {
- len = 0;
- } else {
- memcpy(buffer, &key->key, sizeof(key->key));
- memcpy(buffer + sizeof(key->key),
- key->key.uniquifier, key->key.uniq_len);
- }
-
- return len;
-}
-
-/*
* Define the superblock object for FS-Cache. This is used to describe a
* superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
* parameters that might cause a separate superblock.
@@ -144,84 +67,9 @@ static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
const struct fscache_cookie_def nfs_fscache_super_index_def = {
.name = "NFS.super",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = nfs_super_get_key,
};
/*
- * Definition of the auxiliary data attached to NFS inode storage objects
- * within the cache.
- *
- * The contents of this struct are recorded in the on-disk local cache in the
- * auxiliary data attached to the data storage object backing an inode. This
- * permits coherency to be managed when a new inode binds to an already extant
- * cache object.
- */
-struct nfs_fscache_inode_auxdata {
- struct timespec mtime;
- struct timespec ctime;
- loff_t size;
- u64 change_attr;
-};
-
-/*
- * Generate a key to describe an NFS inode in an NFS server's index
- */
-static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct nfs_inode *nfsi = cookie_netfs_data;
- uint16_t nsize;
-
- /* use the inode's NFS filehandle as the key */
- nsize = nfsi->fh.size;
- memcpy(buffer, nfsi->fh.data, nsize);
- return nsize;
-}
-
-/*
- * Get certain file attributes from the netfs data
- * - This function can be absent for an index
- * - Not permitted to return an error
- * - The netfs data from the cookie being used as the source is presented
- */
-static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct nfs_inode *nfsi = cookie_netfs_data;
-
- *size = nfsi->vfs_inode.i_size;
-}
-
-/*
- * Get the auxiliary data from netfs data
- * - This function can be absent if the index carries no state data
- * - Should store the auxiliary data in the buffer
- * - Should return the amount of amount stored
- * - Not permitted to return an error
- * - The netfs data from the cookie being used as the source is presented
- */
-static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- struct nfs_fscache_inode_auxdata auxdata;
- const struct nfs_inode *nfsi = cookie_netfs_data;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.size = nfsi->vfs_inode.i_size;
- auxdata.mtime = nfsi->vfs_inode.i_mtime;
- auxdata.ctime = nfsi->vfs_inode.i_ctime;
-
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
-
- if (bufmax > sizeof(auxdata))
- bufmax = sizeof(auxdata);
-
- memcpy(buffer, &auxdata, bufmax);
- return bufmax;
-}
-
-/*
* Consult the netfs about the state of an object
* - This function can be absent if the index carries no state data
* - The netfs data from the cookie being used as the target is
@@ -230,7 +78,8 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
static
enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = cookie_netfs_data;
@@ -239,7 +88,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.size = nfsi->vfs_inode.i_size;
auxdata.mtime = nfsi->vfs_inode.i_mtime;
auxdata.ctime = nfsi->vfs_inode.i_ctime;
@@ -288,9 +136,6 @@ static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
const struct fscache_cookie_def nfs_fscache_inode_object_def = {
.name = "NFS.fh",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = nfs_fscache_inode_get_key,
- .get_attr = nfs_fscache_inode_get_attr,
- .get_aux = nfs_fscache_inode_get_aux,
.check_aux = nfs_fscache_inode_check_aux,
.get_context = nfs_fh_get_context,
.put_context = nfs_fh_put_context,
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index d63bea8bbfbb..b55fc7920c3b 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -18,6 +18,7 @@
#include <linux/in6.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "internal.h"
#include "iostat.h"
@@ -29,6 +30,21 @@ static struct rb_root nfs_fscache_keys = RB_ROOT;
static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
/*
+ * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+ struct {
+ uint16_t nfsversion; /* NFS protocol version */
+ uint16_t family; /* address family */
+ __be16 port; /* IP port */
+ } hdr;
+ union {
+ struct in_addr ipv4_addr; /* IPv4 address */
+ struct in6_addr ipv6_addr; /* IPv6 address */
+ };
+} __packed;
+
+/*
* Get the per-client index cookie for an NFS client if the appropriate mount
* flag was set
* - We always try and get an index cookie for the client, but get filehandle
@@ -36,10 +52,41 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
*/
void nfs_fscache_get_client_cookie(struct nfs_client *clp)
{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+ const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+ struct nfs_server_key key;
+ uint16_t len = sizeof(key.hdr);
+
+ memset(&key, 0, sizeof(key));
+ key.hdr.nfsversion = clp->rpc_ops->version;
+ key.hdr.family = clp->cl_addr.ss_family;
+
+ switch (clp->cl_addr.ss_family) {
+ case AF_INET:
+ key.hdr.port = sin->sin_port;
+ key.ipv4_addr = sin->sin_addr;
+ len += sizeof(key.ipv4_addr);
+ break;
+
+ case AF_INET6:
+ key.hdr.port = sin6->sin6_port;
+ key.ipv6_addr = sin6->sin6_addr;
+ len += sizeof(key.ipv6_addr);
+ break;
+
+ default:
+ printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+ clp->cl_addr.ss_family);
+ clp->fscache = NULL;
+ return;
+ }
+
/* create a cache index for looking up filehandles */
clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
&nfs_fscache_server_index_def,
- clp, true);
+ &key, len,
+ NULL, 0,
+ clp, 0, true);
dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
clp, clp->fscache);
}
@@ -52,7 +99,7 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
clp, clp->fscache);
- fscache_relinquish_cookie(clp->fscache, 0);
+ fscache_relinquish_cookie(clp->fscache, NULL, false);
clp->fscache = NULL;
}
@@ -139,7 +186,9 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
/* create a cache index for looking up filehandles */
nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
&nfs_fscache_super_index_def,
- nfss, true);
+ key, sizeof(*key) + ulen,
+ NULL, 0,
+ nfss, 0, true);
dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
nfss, nfss->fscache);
return;
@@ -163,7 +212,7 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
nfss, nfss->fscache);
- fscache_relinquish_cookie(nfss->fscache, 0);
+ fscache_relinquish_cookie(nfss->fscache, NULL, false);
nfss->fscache = NULL;
if (nfss->fscache_key) {
@@ -180,14 +229,25 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
*/
void nfs_fscache_init_inode(struct inode *inode)
{
+ struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
nfsi->fscache = NULL;
if (!S_ISREG(inode->i_mode))
return;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+ if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+ auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
&nfs_fscache_inode_object_def,
- nfsi, false);
+ nfsi->fh.data, nfsi->fh.size,
+ &auxdata, sizeof(auxdata),
+ nfsi, nfsi->vfs_inode.i_size, false);
}
/*
@@ -195,12 +255,16 @@ void nfs_fscache_init_inode(struct inode *inode)
*/
void nfs_fscache_clear_inode(struct inode *inode)
{
+ struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
- fscache_relinquish_cookie(cookie, false);
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+ fscache_relinquish_cookie(cookie, &auxdata, false);
nfsi->fscache = NULL;
}
@@ -232,20 +296,26 @@ static bool nfs_fscache_can_enable(void *data)
*/
void nfs_fscache_open_file(struct inode *inode, struct file *filp)
{
+ struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
if (!fscache_cookie_valid(cookie))
return;
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
if (inode_is_open_for_write(inode)) {
dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
- fscache_disable_cookie(cookie, true);
+ fscache_disable_cookie(cookie, &auxdata, true);
fscache_uncache_all_inode_pages(cookie, inode);
} else {
dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
- fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+ fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size,
+ nfs_fscache_can_enable, inode);
if (fscache_cookie_enabled(cookie))
set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
}
@@ -422,7 +492,8 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
"NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
nfs_i_fscache(inode), page, page->index, page->flags, sync);
- ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
+ ret = fscache_write_page(nfs_i_fscache(inode), page,
+ inode->i_size, GFP_KERNEL);
dfprintk(FSCACHE,
"NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
page, page->index, page->flags, ret);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index d7fe3e799f2f..161ba2edb9d0 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -57,6 +57,21 @@ struct nfs_fscache_key {
};
/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode. This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+ struct timespec mtime;
+ struct timespec ctime;
+ u64 change_attr;
+};
+
+/*
* fscache-index.c
*/
extern struct fscache_netfs nfs_fscache_netfs;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d893543cf3b..d17a90c4fa37 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -85,11 +85,6 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
-int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode)
-{
- return nfs_wait_killable(mode);
-}
-
/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 49f848fd1f04..7327930ad970 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -873,7 +873,7 @@ static void nfs3_nlm_release_call(void *data)
}
}
-const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
+static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
.nlmclnt_alloc_call = nfs3_nlm_alloc_call,
.nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare,
.nlmclnt_release_call = nfs3_nlm_release_call,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 04612c24d394..979631411a0e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -868,8 +868,10 @@ static int nfs4_set_client(struct nfs_server *server,
if (IS_ERR(clp))
return PTR_ERR(clp);
- if (server->nfs_client == clp)
+ if (server->nfs_client == clp) {
+ nfs_put_client(clp);
return -ELOOP;
+ }
/*
* Query for the lease time on clientid setup or renewal
@@ -1244,11 +1246,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
clp->cl_proto, clnt->cl_timeout,
clp->cl_minorversion, net);
clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
- nfs_put_client(clp);
if (error != 0) {
nfs_server_insert_lists(server);
return error;
}
+ nfs_put_client(clp);
if (server->nfs_client->cl_hostname == NULL)
server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 65c9c4175145..b993ad282de2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,7 +52,6 @@
#include <linux/nfs.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
-#include <linux/fs_struct.h>
#include "nfs4_fs.h"
#include "internal.h"
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 18a7626ac638..67d19cd92e44 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -98,8 +98,8 @@ nfs_page_free(struct nfs_page *p)
int
nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
{
- return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
- TASK_KILLABLE);
+ return wait_var_event_killable(&l_ctx->io_count,
+ !atomic_read(&l_ctx->io_count));
}
/**
@@ -395,7 +395,7 @@ static void nfs_clear_request(struct nfs_page *req)
}
if (l_ctx != NULL) {
if (atomic_dec_and_test(&l_ctx->io_count)) {
- wake_up_atomic_t(&l_ctx->io_count);
+ wake_up_var(&l_ctx->io_count);
if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c13e826614b5..ee723aa153a3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -292,8 +292,11 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
- struct inode *inode = lo->plh_inode;
+ struct inode *inode;
+ if (!lo)
+ return;
+ inode = lo->plh_inode;
pnfs_layoutreturn_before_put_layout_hdr(lo);
if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
@@ -1241,10 +1244,12 @@ retry:
spin_lock(&ino->i_lock);
lo = nfsi->layout;
if (!lo || !pnfs_layout_is_valid(lo) ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ lo = NULL;
goto out_noroc;
+ }
+ pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
- pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
TASK_UNINTERRUPTIBLE);
@@ -1312,10 +1317,12 @@ out_noroc:
struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
if (ld->prepare_layoutreturn)
ld->prepare_layoutreturn(args);
+ pnfs_put_layout_hdr(lo);
return true;
}
if (layoutreturn)
pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+ pnfs_put_layout_hdr(lo);
return false;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 03aaa60c7768..32ba2d471853 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -245,7 +245,7 @@ pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
{
if (list_empty(pages)) {
if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
- wake_up_atomic_t(&cinfo->mds->rpcs_out);
+ wake_up_var(&cinfo->mds->rpcs_out);
/* don't call nfs_commitdata_release - it tries to put
* the open_context which is not acquired until nfs_init_commit
* which has not been called on @data */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29bacdc56f6a..5e470e233c83 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
/* initial superblock/root creation */
mount_info->fill_super(s, mount_info);
nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
+ if (!(server->flags & NFS_MOUNT_UNSHARED))
+ s->s_iflags |= SB_I_MULTIROOT;
}
mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7428a669d7a7..6579f3b367bd 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1620,8 +1620,8 @@ static void nfs_writeback_result(struct rpc_task *task,
static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
{
- return wait_on_atomic_t(&cinfo->rpcs_out,
- nfs_wait_atomic_killable, TASK_KILLABLE);
+ return wait_var_event_killable(&cinfo->rpcs_out,
+ !atomic_read(&cinfo->rpcs_out));
}
static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
@@ -1632,7 +1632,7 @@ static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
if (atomic_dec_and_test(&cinfo->rpcs_out))
- wake_up_atomic_t(&cinfo->rpcs_out);
+ wake_up_var(&cinfo->rpcs_out);
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1876,40 +1876,43 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
return status;
}
-int nfs_commit_inode(struct inode *inode, int how)
+static int __nfs_commit_inode(struct inode *inode, int how,
+ struct writeback_control *wbc)
{
LIST_HEAD(head);
struct nfs_commit_info cinfo;
int may_wait = how & FLUSH_SYNC;
- int error = 0;
- int res;
+ int ret, nscan;
nfs_init_cinfo_from_inode(&cinfo, inode);
nfs_commit_begin(cinfo.mds);
- res = nfs_scan_commit(inode, &head, &cinfo);
- if (res)
- error = nfs_generic_commit_list(inode, &head, how, &cinfo);
+ for (;;) {
+ ret = nscan = nfs_scan_commit(inode, &head, &cinfo);
+ if (ret <= 0)
+ break;
+ ret = nfs_generic_commit_list(inode, &head, how, &cinfo);
+ if (ret < 0)
+ break;
+ ret = 0;
+ if (wbc && wbc->sync_mode == WB_SYNC_NONE) {
+ if (nscan < wbc->nr_to_write)
+ wbc->nr_to_write -= nscan;
+ else
+ wbc->nr_to_write = 0;
+ }
+ if (nscan < INT_MAX)
+ break;
+ cond_resched();
+ }
nfs_commit_end(cinfo.mds);
- if (res == 0)
- return res;
- if (error < 0)
- goto out_error;
- if (!may_wait)
- goto out_mark_dirty;
- error = wait_on_commit(cinfo.mds);
- if (error < 0)
- return error;
- return res;
-out_error:
- res = error;
- /* Note: If we exit without ensuring that the commit is complete,
- * we must mark the inode as dirty. Otherwise, future calls to
- * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
- * that the data is on the disk.
- */
-out_mark_dirty:
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
- return res;
+ if (ret || !may_wait)
+ return ret;
+ return wait_on_commit(cinfo.mds);
+}
+
+int nfs_commit_inode(struct inode *inode, int how)
+{
+ return __nfs_commit_inode(inode, how, NULL);
}
EXPORT_SYMBOL_GPL(nfs_commit_inode);
@@ -1919,11 +1922,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
int flags = FLUSH_SYNC;
int ret = 0;
- /* no commits means nothing needs to be done */
- if (!atomic_long_read(&nfsi->commit_info.ncommit))
- return ret;
-
if (wbc->sync_mode == WB_SYNC_NONE) {
+ /* no commits means nothing needs to be done */
+ if (!atomic_long_read(&nfsi->commit_info.ncommit))
+ goto check_requests_outstanding;
+
/* Don't commit yet if this is a non-blocking flush and there
* are a lot of outstanding writes for this mapping.
*/
@@ -1934,16 +1937,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
flags = 0;
}
- ret = nfs_commit_inode(inode, flags);
- if (ret >= 0) {
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (ret < wbc->nr_to_write)
- wbc->nr_to_write -= ret;
- else
- wbc->nr_to_write = 0;
- }
- return 0;
- }
+ ret = __nfs_commit_inode(inode, flags, wbc);
+ if (!ret) {
+ if (flags & FLUSH_SYNC)
+ return 0;
+ } else if (atomic_long_read(&nfsi->commit_info.ncommit))
+ goto out_mark_dirty;
+
+check_requests_outstanding:
+ if (!atomic_read(&nfsi->commit_info.rpcs_out))
+ return ret;
out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 1d0ce3c57d93..6259a4b8579f 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -192,6 +192,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
struct nfsd3_writeres *resp = rqstp->rq_resp;
__be32 nfserr;
unsigned long cnt = argp->len;
+ unsigned int nvecs;
dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
SVCFH_fmt(&argp->fh),
@@ -201,9 +202,12 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
+ nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt);
+ if (!nvecs)
+ RETURN_STATUS(nfserr_io);
nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, argp->vlen,
- &cnt, resp->committed);
+ rqstp->rq_vec, nvecs, &cnt,
+ resp->committed);
resp->count = cnt;
RETURN_STATUS(nfserr);
}
@@ -279,6 +283,16 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
struct nfsd3_diropres *resp = rqstp->rq_resp;
__be32 nfserr;
+ if (argp->tlen == 0)
+ RETURN_STATUS(nfserr_inval);
+ if (argp->tlen > NFS3_MAXPATHLEN)
+ RETURN_STATUS(nfserr_nametoolong);
+
+ argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
+ argp->tlen);
+ if (IS_ERR(argp->tname))
+ RETURN_STATUS(nfserrno(PTR_ERR(argp->tname)));
+
dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
SVCFH_fmt(&argp->ffh),
argp->flen, argp->fname,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 1a70581e1cb2..3192b544a441 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -391,7 +391,7 @@ int
nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_writeargs *args = rqstp->rq_argp;
- unsigned int len, v, hdr, dlen;
+ unsigned int len, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
struct kvec *head = rqstp->rq_arg.head;
struct kvec *tail = rqstp->rq_arg.tail;
@@ -433,17 +433,9 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
args->count = max_blocksize;
len = args->len = max_blocksize;
}
- rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
- v = 0;
- while (len > rqstp->rq_vec[v].iov_len) {
- len -= rqstp->rq_vec[v].iov_len;
- v++;
- rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
- rqstp->rq_vec[v].iov_len = PAGE_SIZE;
- }
- rqstp->rq_vec[v].iov_len = len;
- args->vlen = v + 1;
+
+ args->first.iov_base = (void *)p;
+ args->first.iov_len = head->iov_len - hdr;
return 1;
}
@@ -489,51 +481,24 @@ int
nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_symlinkargs *args = rqstp->rq_argp;
- unsigned int len, avail;
- char *old, *new;
- struct kvec *vec;
+ char *base = (char *)p;
+ size_t dlen;
if (!(p = decode_fh(p, &args->ffh)) ||
- !(p = decode_filename(p, &args->fname, &args->flen))
- )
+ !(p = decode_filename(p, &args->fname, &args->flen)))
return 0;
p = decode_sattr3(p, &args->attrs);
- /* now decode the pathname, which might be larger than the first page.
- * As we have to check for nul's anyway, we copy it into a new page
- * This page appears in the rq_res.pages list, but as pages_len is always
- * 0, it won't get in the way
- */
- len = ntohl(*p++);
- if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
- return 0;
- args->tname = new = page_address(*(rqstp->rq_next_page++));
- args->tlen = len;
- /* first copy and check from the first page */
- old = (char*)p;
- vec = &rqstp->rq_arg.head[0];
- if ((void *)old > vec->iov_base + vec->iov_len)
- return 0;
- avail = vec->iov_len - (old - (char*)vec->iov_base);
- while (len && avail && *old) {
- *new++ = *old++;
- len--;
- avail--;
- }
- /* now copy next page if there is one */
- if (len && !avail && rqstp->rq_arg.page_len) {
- avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE);
- old = page_address(rqstp->rq_arg.pages[0]);
- }
- while (len && avail && *old) {
- *new++ = *old++;
- len--;
- avail--;
- }
- *new = '\0';
- if (len)
- return 0;
+ args->tlen = ntohl(*p++);
+
+ args->first.iov_base = p;
+ args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+ args->first.iov_len -= (char *)p - base;
+ dlen = args->first.iov_len + rqstp->rq_arg.page_len +
+ rqstp->rq_arg.tail[0].iov_len;
+ if (dlen < XDR_QUADLEN(args->tlen) << 2)
+ return 0;
return 1;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 49b0a9e7ff18..1f04d2a70d25 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -223,8 +223,8 @@ static int nfs_cb_stat_to_errno(int status)
return -status;
}
-static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
- int *status)
+static int decode_cb_op_status(struct xdr_stream *xdr,
+ enum nfs_cb_opnum4 expected, int *status)
{
__be32 *p;
u32 op;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 7d888369f85a..228faf00a594 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -165,7 +165,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
struct nfs4_client *clp = ls->ls_stid.sc_client;
struct nfs4_file *fp = ls->ls_stid.sc_file;
- trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
spin_lock(&clp->cl_lock);
list_del_init(&ls->ls_perclnt);
@@ -264,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
list_add(&ls->ls_perfile, &fp->fi_lo_states);
spin_unlock(&fp->fi_lock);
- trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
return ls;
}
@@ -334,7 +334,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
if (list_empty(&ls->ls_layouts))
goto out_unlock;
- trace_layout_recall(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
refcount_inc(&ls->ls_stid.sc_count);
nfsd4_run_cb(&ls->ls_recall);
@@ -507,7 +507,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
false, lrp->lr_layout_type,
&ls);
if (nfserr) {
- trace_layout_return_lookup_fail(&lrp->lr_sid);
+ trace_nfsd_layout_return_lookup_fail(&lrp->lr_sid);
return nfserr;
}
@@ -523,7 +523,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
lrp->lrs_present = 1;
} else {
- trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
nfs4_unhash_stid(&ls->ls_stid);
lrp->lrs_present = 0;
}
@@ -694,7 +694,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
/*
* Unknown error or non-responding client, we'll need to fence.
*/
- trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid);
ops = nfsd4_layout_ops[ls->ls_layout_type];
if (ops->fence_client)
@@ -703,7 +703,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
nfsd4_cb_layout_fail(ls);
return -1;
case -NFS4ERR_NOMATCHING_LAYOUT:
- trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
task->tk_status = 0;
return 1;
}
@@ -716,7 +716,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
container_of(cb, struct nfs4_layout_stateid, ls_recall);
LIST_HEAD(reaplist);
- trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+ trace_nfsd_layout_recall_release(&ls->ls_stid.sc_stateid);
nfsd4_return_all_layouts(ls, &reaplist);
nfsd4_free_layouts(&reaplist);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a0bed2b2004d..5d99e8810b85 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -32,6 +32,7 @@
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/fs_struct.h>
#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/slab.h>
@@ -252,11 +253,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* Note: create modes (UNCHECKED,GUARDED...) are the same
* in NFSv4 as in v3 except EXCLUSIVE4_1.
*/
+ current->fs->umask = open->op_umask;
status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
open->op_fname.len, &open->op_iattr,
*resfh, open->op_createmode,
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
+ current->fs->umask = 0;
if (!status && open->op_label.len)
nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
@@ -603,6 +606,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
+ current->fs->umask = create->cr_umask;
switch (create->cr_type) {
case NF4LNK:
status = nfsd_symlink(rqstp, &cstate->current_fh,
@@ -611,20 +615,22 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NF4BLK:
+ status = nfserr_inval;
rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
if (MAJOR(rdev) != create->cr_specdata1 ||
MINOR(rdev) != create->cr_specdata2)
- return nfserr_inval;
+ goto out_umask;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
&create->cr_iattr, S_IFBLK, rdev, &resfh);
break;
case NF4CHR:
+ status = nfserr_inval;
rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
if (MAJOR(rdev) != create->cr_specdata1 ||
MINOR(rdev) != create->cr_specdata2)
- return nfserr_inval;
+ goto out_umask;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
&create->cr_iattr,S_IFCHR, rdev, &resfh);
@@ -668,6 +674,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fh_dup2(&cstate->current_fh, &resfh);
out:
fh_put(&resfh);
+out_umask:
+ current->fs->umask = 0;
return status;
}
@@ -751,6 +759,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (read->rd_offset >= OFFSET_MAX)
return nfserr_inval;
+ trace_nfsd_read_start(rqstp, &cstate->current_fh,
+ read->rd_offset, read->rd_length);
+
/*
* If we do a zero copy read, then a client will see read data
* that reflects the state of the file *after* performing the
@@ -783,6 +794,8 @@ nfsd4_read_release(union nfsd4_op_u *u)
{
if (u->read.rd_filp)
fput(u->read.rd_filp);
+ trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
+ u->read.rd_offset, u->read.rd_length);
}
static __be32
@@ -1001,6 +1014,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
+ cnt = write->wr_buflen;
+ trace_nfsd_write_start(rqstp, &cstate->current_fh,
+ write->wr_offset, cnt);
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
stateid, WR_STATE, &filp, NULL);
if (status) {
@@ -1008,7 +1024,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
- cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
@@ -1021,7 +1036,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fput(filp);
write->wr_bytes_written = cnt;
-
+ trace_nfsd_write_done(rqstp, &cstate->current_fh,
+ write->wr_offset, cnt);
return status;
}
@@ -1106,7 +1122,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
else {
copy->cp_res.wr_bytes_written = bytes;
copy->cp_res.wr_stable_how = NFS_UNSTABLE;
- copy->cp_consecutive = 1;
copy->cp_synchronous = 1;
gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
status = nfs_ok;
@@ -1412,7 +1427,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
true, lgp->lg_layout_type, &ls);
if (nfserr) {
- trace_layout_get_lookup_fail(&lgp->lg_sid);
+ trace_nfsd_layout_get_lookup_fail(&lgp->lg_sid);
goto out;
}
@@ -1481,7 +1496,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
false, lcp->lc_layout_type,
&ls);
if (nfserr) {
- trace_layout_commit_lookup_fail(&lcp->lc_sid);
+ trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid);
/* fixup error code as per RFC5661 */
if (nfserr == nfserr_bad_stateid)
nfserr = nfserr_badlayout;
@@ -1714,12 +1729,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
goto encode_op;
}
+ trace_nfsd_compound(rqstp, args->opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
- dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
- resp->opcnt, args->opcnt, op->opnum,
- nfsd4_op_name(op->opnum));
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -1793,9 +1806,8 @@ encode_op:
status = op->status;
}
- dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
- args->ops, args->opcnt, resp->opcnt, op->opnum,
- be32_to_cpu(status));
+ trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
+ nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
nfsd4_increment_op_stats(op->opnum);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 150521c9671b..fc74d6f46bd5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,6 +98,7 @@ enum nfsd4_st_mutex_lock_subclass {
*/
static DECLARE_WAIT_QUEUE_HEAD(close_wq);
+static struct kmem_cache *client_slab;
static struct kmem_cache *openowner_slab;
static struct kmem_cache *lockowner_slab;
static struct kmem_cache *file_slab;
@@ -268,6 +269,35 @@ free_blocked_lock(struct nfsd4_blocked_lock *nbl)
kfree(nbl);
}
+static void
+remove_blocked_locks(struct nfs4_lockowner *lo)
+{
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct nfsd4_blocked_lock *nbl;
+ LIST_HEAD(reaplist);
+
+ /* Dequeue all blocked locks */
+ spin_lock(&nn->blocked_locks_lock);
+ while (!list_empty(&lo->lo_blocked)) {
+ nbl = list_first_entry(&lo->lo_blocked,
+ struct nfsd4_blocked_lock,
+ nbl_list);
+ list_del_init(&nbl->nbl_list);
+ list_move(&nbl->nbl_lru, &reaplist);
+ }
+ spin_unlock(&nn->blocked_locks_lock);
+
+ /* Now free them */
+ while (!list_empty(&reaplist)) {
+ nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
+ nbl_lru);
+ list_del_init(&nbl->nbl_lru);
+ posix_unblock_lock(&nbl->nbl_lock);
+ free_blocked_lock(nbl);
+ }
+}
+
static int
nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
@@ -777,7 +807,8 @@ static void block_delegations(struct knfsd_fh *fh)
}
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+ struct svc_fh *current_fh,
struct nfs4_clnt_odstate *odstate)
{
struct nfs4_delegation *dp;
@@ -808,6 +839,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
dp->dl_retries = 1;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
&nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+ get_nfs4_file(fp);
+ dp->dl_stid.sc_file = fp;
return dp;
out_dec:
atomic_long_dec(&num_delegations);
@@ -845,19 +878,35 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
spin_unlock(&stid->sc_lock);
}
-static void nfs4_put_deleg_lease(struct nfs4_file *fp)
+static void put_deleg_file(struct nfs4_file *fp)
{
struct file *filp = NULL;
spin_lock(&fp->fi_lock);
- if (fp->fi_deleg_file && --fp->fi_delegees == 0)
+ if (--fp->fi_delegees == 0)
swap(filp, fp->fi_deleg_file);
spin_unlock(&fp->fi_lock);
- if (filp) {
- vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
+ if (filp)
fput(filp);
- }
+}
+
+static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
+{
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+ struct file *filp = fp->fi_deleg_file;
+
+ WARN_ON_ONCE(!fp->fi_delegees);
+
+ vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
+ put_deleg_file(fp);
+}
+
+static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
+{
+ put_clnt_odstate(dp->dl_clnt_odstate);
+ nfs4_unlock_deleg_lease(dp);
+ nfs4_put_stid(&dp->dl_stid);
}
void nfs4_unhash_stid(struct nfs4_stid *s)
@@ -866,20 +915,16 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
}
/**
- * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * nfs4_delegation_exists - Discover if this delegation already exists
* @clp: a pointer to the nfs4_client we're granting a delegation to
* @fp: a pointer to the nfs4_file we're granting a delegation on
*
* Return:
- * On success: NULL if an existing delegation was not found.
- *
- * On error: -EAGAIN if one was previously granted to this nfs4_client
- * for this nfs4_file.
- *
+ * On success: true iff an existing delegation is found
*/
-static int
-nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+static bool
+nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp)
{
struct nfs4_delegation *searchdp = NULL;
struct nfs4_client *searchclp = NULL;
@@ -890,10 +935,10 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
searchclp = searchdp->dl_stid.sc_client;
if (clp == searchclp) {
- return -EAGAIN;
+ return true;
}
}
- return 0;
+ return false;
}
/**
@@ -912,16 +957,13 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
static int
hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
- int status;
struct nfs4_client *clp = dp->dl_stid.sc_client;
lockdep_assert_held(&state_lock);
lockdep_assert_held(&fp->fi_lock);
- status = nfs4_get_existing_delegation(clp, fp);
- if (status)
- return status;
- ++fp->fi_delegees;
+ if (nfs4_delegation_exists(clp, fp))
+ return -EAGAIN;
refcount_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -957,11 +999,8 @@ static void destroy_delegation(struct nfs4_delegation *dp)
spin_lock(&state_lock);
unhashed = unhash_delegation_locked(dp);
spin_unlock(&state_lock);
- if (unhashed) {
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_deleg_lease(dp->dl_stid.sc_file);
- nfs4_put_stid(&dp->dl_stid);
- }
+ if (unhashed)
+ destroy_unhashed_deleg(dp);
}
static void revoke_delegation(struct nfs4_delegation *dp)
@@ -970,17 +1009,14 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_deleg_lease(dp->dl_stid.sc_file);
-
- if (clp->cl_minorversion == 0)
- nfs4_put_stid(&dp->dl_stid);
- else {
+ if (clp->cl_minorversion) {
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+ refcount_inc(&dp->dl_stid.sc_count);
spin_lock(&clp->cl_lock);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
spin_unlock(&clp->cl_lock);
}
+ destroy_unhashed_deleg(dp);
}
/*
@@ -1765,7 +1801,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
struct nfs4_client *clp;
int i;
- clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+ clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
if (clp == NULL)
return NULL;
clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
@@ -1796,7 +1832,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
err_no_hashtbl:
kfree(clp->cl_name.data);
err_no_name:
- kfree(clp);
+ kmem_cache_free(client_slab, clp);
return NULL;
}
@@ -1816,7 +1852,7 @@ free_client(struct nfs4_client *clp)
kfree(clp->cl_ownerstr_hashtbl);
kfree(clp->cl_name.data);
idr_destroy(&clp->cl_stateids);
- kfree(clp);
+ kmem_cache_free(client_slab, clp);
}
/* must be called under the client_lock */
@@ -1866,6 +1902,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp)
static void
__destroy_client(struct nfs4_client *clp)
{
+ int i;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head reaplist;
@@ -1881,9 +1918,7 @@ __destroy_client(struct nfs4_client *clp)
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_deleg_lease(dp->dl_stid.sc_file);
- nfs4_put_stid(&dp->dl_stid);
+ destroy_unhashed_deleg(dp);
}
while (!list_empty(&clp->cl_revoked)) {
dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
@@ -1895,6 +1930,16 @@ __destroy_client(struct nfs4_client *clp)
nfs4_get_stateowner(&oo->oo_owner);
release_openowner(oo);
}
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ struct nfs4_stateowner *so, *tmp;
+
+ list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i],
+ so_strhash) {
+ /* Should be no openowners at this point */
+ WARN_ON_ONCE(so->so_is_open_owner);
+ remove_blocked_locks(lockowner(so));
+ }
+ }
nfsd4_return_all_client_layouts(clp);
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
@@ -2913,7 +2958,7 @@ out_no_session:
static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
{
if (!session)
- return 0;
+ return false;
return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
}
@@ -3431,21 +3476,26 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
void
nfsd4_free_slabs(void)
{
- kmem_cache_destroy(odstate_slab);
+ kmem_cache_destroy(client_slab);
kmem_cache_destroy(openowner_slab);
kmem_cache_destroy(lockowner_slab);
kmem_cache_destroy(file_slab);
kmem_cache_destroy(stateid_slab);
kmem_cache_destroy(deleg_slab);
+ kmem_cache_destroy(odstate_slab);
}
int
nfsd4_init_slabs(void)
{
+ client_slab = kmem_cache_create("nfsd4_clients",
+ sizeof(struct nfs4_client), 0, 0, NULL);
+ if (client_slab == NULL)
+ goto out;
openowner_slab = kmem_cache_create("nfsd4_openowners",
sizeof(struct nfs4_openowner), 0, 0, NULL);
if (openowner_slab == NULL)
- goto out;
+ goto out_free_client_slab;
lockowner_slab = kmem_cache_create("nfsd4_lockowners",
sizeof(struct nfs4_lockowner), 0, 0, NULL);
if (lockowner_slab == NULL)
@@ -3478,6 +3528,8 @@ out_free_lockowner_slab:
kmem_cache_destroy(lockowner_slab);
out_free_openowner_slab:
kmem_cache_destroy(openowner_slab);
+out_free_client_slab:
+ kmem_cache_destroy(client_slab);
out:
dprintk("nfsd4: out of memory while initializing nfsv4\n");
return -ENOMEM;
@@ -3905,17 +3957,9 @@ static bool
nfsd_break_deleg_cb(struct file_lock *fl)
{
bool ret = false;
- struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
- struct nfs4_delegation *dp;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
- if (!fp) {
- WARN(1, "(%p)->fl_owner NULL\n", fl);
- return ret;
- }
- if (fp->fi_had_conflict) {
- WARN(1, "duplicate break on %p\n", fp);
- return ret;
- }
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -3925,15 +3969,7 @@ nfsd_break_deleg_cb(struct file_lock *fl)
spin_lock(&fp->fi_lock);
fp->fi_had_conflict = true;
- /*
- * If there are no delegations on the list, then return true
- * so that the lease code will go ahead and delete it.
- */
- if (list_empty(&fp->fi_delegations))
- ret = true;
- else
- list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
- nfsd_break_one_deleg(dp);
+ nfsd_break_one_deleg(dp);
spin_unlock(&fp->fi_lock);
return ret;
}
@@ -4257,7 +4293,8 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
}
-static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+ int flag)
{
struct file_lock *fl;
@@ -4268,124 +4305,88 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
fl->fl_flags = FL_DELEG;
fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
fl->fl_end = OFFSET_MAX;
- fl->fl_owner = (fl_owner_t)fp;
+ fl->fl_owner = (fl_owner_t)dp;
fl->fl_pid = current->tgid;
+ fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file;
return fl;
}
-/**
- * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
- * @dp: a pointer to the nfs4_delegation we're adding.
- *
- * Return:
- * On success: Return code will be 0 on success.
- *
- * On error: -EAGAIN if there was an existing delegation.
- * nonzero if there is an error in other cases.
- *
- */
-
-static int nfs4_setlease(struct nfs4_delegation *dp)
-{
- struct nfs4_file *fp = dp->dl_stid.sc_file;
- struct file_lock *fl;
- struct file *filp;
- int status = 0;
-
- fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ);
- if (!fl)
- return -ENOMEM;
- filp = find_readable_file(fp);
- if (!filp) {
- /* We should always have a readable file here */
- WARN_ON_ONCE(1);
- locks_free_lock(fl);
- return -EBADF;
- }
- fl->fl_file = filp;
- status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
- if (fl)
- locks_free_lock(fl);
- if (status)
- goto out_fput;
- spin_lock(&state_lock);
- spin_lock(&fp->fi_lock);
- /* Did the lease get broken before we took the lock? */
- status = -EAGAIN;
- if (fp->fi_had_conflict)
- goto out_unlock;
- /* Race breaker */
- if (fp->fi_deleg_file) {
- status = hash_delegation_locked(dp, fp);
- goto out_unlock;
- }
- fp->fi_deleg_file = filp;
- fp->fi_delegees = 0;
- status = hash_delegation_locked(dp, fp);
- spin_unlock(&fp->fi_lock);
- spin_unlock(&state_lock);
- if (status) {
- /* Should never happen, this is a new fi_deleg_file */
- WARN_ON_ONCE(1);
- goto out_fput;
- }
- return 0;
-out_unlock:
- spin_unlock(&fp->fi_lock);
- spin_unlock(&state_lock);
-out_fput:
- fput(filp);
- return status;
-}
-
static struct nfs4_delegation *
nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
{
- int status;
+ int status = 0;
struct nfs4_delegation *dp;
+ struct file *filp;
+ struct file_lock *fl;
+ /*
+ * The fi_had_conflict and nfs_get_existing_delegation checks
+ * here are just optimizations; we'll need to recheck them at
+ * the end:
+ */
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
+ filp = find_readable_file(fp);
+ if (!filp) {
+ /* We should always have a readable file here */
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EBADF);
+ }
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
- status = nfs4_get_existing_delegation(clp, fp);
+ if (nfs4_delegation_exists(clp, fp))
+ status = -EAGAIN;
+ else if (!fp->fi_deleg_file) {
+ fp->fi_deleg_file = filp;
+ /* increment early to prevent fi_deleg_file from being
+ * cleared */
+ fp->fi_delegees = 1;
+ filp = NULL;
+ } else
+ fp->fi_delegees++;
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
-
+ if (filp)
+ fput(filp);
if (status)
return ERR_PTR(status);
- dp = alloc_init_deleg(clp, fh, odstate);
+ status = -ENOMEM;
+ dp = alloc_init_deleg(clp, fp, fh, odstate);
if (!dp)
- return ERR_PTR(-ENOMEM);
+ goto out_delegees;
+
+ fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
+ if (!fl)
+ goto out_stid;
+
+ status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
+ if (fl)
+ locks_free_lock(fl);
+ if (status)
+ goto out_clnt_odstate;
- get_nfs4_file(fp);
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
- dp->dl_stid.sc_file = fp;
- if (!fp->fi_deleg_file) {
- spin_unlock(&fp->fi_lock);
- spin_unlock(&state_lock);
- status = nfs4_setlease(dp);
- goto out;
- }
- if (fp->fi_had_conflict) {
+ if (fp->fi_had_conflict)
status = -EAGAIN;
- goto out_unlock;
- }
- status = hash_delegation_locked(dp, fp);
-out_unlock:
+ else
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
-out:
- if (status) {
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_stid(&dp->dl_stid);
- return ERR_PTR(status);
- }
+
+ if (status)
+ destroy_unhashed_deleg(dp);
return dp;
+out_clnt_odstate:
+ put_clnt_odstate(dp->dl_clnt_odstate);
+out_stid:
+ nfs4_put_stid(&dp->dl_stid);
+out_delegees:
+ put_deleg_file(fp);
+ return ERR_PTR(status);
}
static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -5481,15 +5482,26 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
stp->st_stid.sc_type = NFS4_CLOSED_STID;
+
+ /*
+ * Technically we don't _really_ have to increment or copy it, since
+ * it should just be gone after this operation and we clobber the
+ * copied value below, but we continue to do so here just to ensure
+ * that racing ops see that there was a state change.
+ */
nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
nfsd4_close_open_stateid(stp);
mutex_unlock(&stp->st_mutex);
- /* See RFC5661 sectionm 18.2.4 */
- if (stp->st_stid.sc_client->cl_minorversion)
- memcpy(&close->cl_stateid, &close_stateid,
- sizeof(close->cl_stateid));
+ /* v4.1+ suggests that we send a special stateid in here, since the
+ * clients should just ignore this anyway. Since this is not useful
+ * for v4.0 clients either, we set it to the special close_stateid
+ * universally.
+ *
+ * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5
+ */
+ memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid));
/* put reference from nfs4_preprocess_seqid_op */
nfs4_put_stid(&stp->st_stid);
@@ -6355,6 +6367,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
}
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
+ remove_blocked_locks(lo);
nfs4_put_stateowner(&lo->lo_owner);
return status;
@@ -7140,6 +7153,8 @@ nfs4_state_destroy_net(struct net *net)
}
}
+ WARN_ON(!list_empty(&nn->blocked_locks_lru));
+
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
while (!list_empty(&nn->unconf_id_hashtbl[i])) {
clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
@@ -7206,7 +7221,6 @@ nfs4_state_shutdown_net(struct net *net)
struct nfs4_delegation *dp = NULL;
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct nfsd4_blocked_lock *nbl;
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
@@ -7222,27 +7236,7 @@ nfs4_state_shutdown_net(struct net *net)
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_deleg_lease(dp->dl_stid.sc_file);
- nfs4_put_stid(&dp->dl_stid);
- }
-
- BUG_ON(!list_empty(&reaplist));
- spin_lock(&nn->blocked_locks_lock);
- while (!list_empty(&nn->blocked_locks_lru)) {
- nbl = list_first_entry(&nn->blocked_locks_lru,
- struct nfsd4_blocked_lock, nbl_lru);
- list_move(&nbl->nbl_lru, &reaplist);
- list_del_init(&nbl->nbl_list);
- }
- spin_unlock(&nn->blocked_locks_lock);
-
- while (!list_empty(&reaplist)) {
- nbl = list_first_entry(&reaplist,
- struct nfsd4_blocked_lock, nbl_lru);
- list_del_init(&nbl->nbl_lru);
- posix_unblock_lock(&nbl->nbl_lock);
- free_blocked_lock(nbl);
+ destroy_unhashed_deleg(dp);
}
nfsd4_client_tracking_exit(net);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e502fd16246b..1d048dd95464 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -33,7 +33,6 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <linux/fs_struct.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
@@ -682,7 +681,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
&create->cr_acl, &create->cr_label,
- &current->fs->umask);
+ &create->cr_umask);
if (status)
goto out;
@@ -927,7 +926,6 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
case NFS4_OPEN_NOCREATE:
break;
case NFS4_OPEN_CREATE:
- current->fs->umask = 0;
READ_BUF(4);
open->op_createmode = be32_to_cpup(p++);
switch (open->op_createmode) {
@@ -935,7 +933,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
case NFS4_CREATE_GUARDED:
status = nfsd4_decode_fattr(argp, open->op_bmval,
&open->op_iattr, &open->op_acl, &open->op_label,
- &current->fs->umask);
+ &open->op_umask);
if (status)
goto out;
break;
@@ -950,7 +948,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_fattr(argp, open->op_bmval,
&open->op_iattr, &open->op_acl, &open->op_label,
- &current->fs->umask);
+ &open->op_umask);
if (status)
goto out;
break;
@@ -1759,7 +1757,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
p = xdr_decode_hyper(p, &copy->cp_src_pos);
p = xdr_decode_hyper(p, &copy->cp_dst_pos);
p = xdr_decode_hyper(p, &copy->cp_count);
- copy->cp_consecutive = be32_to_cpup(p++);
+ p++; /* ca_consecutive: we always do consecutive copies */
copy->cp_synchronous = be32_to_cpup(p++);
tmp = be32_to_cpup(p); /* Source server list not supported */
@@ -3427,8 +3425,9 @@ static __be32 nfsd4_encode_splice_read(
return nfserr_resource;
len = maxcount;
- nfserr = nfsd_splice_read(read->rd_rqstp, file,
- read->rd_offset, &maxcount);
+ nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
+ file, read->rd_offset, &maxcount);
+ read->rd_length = maxcount;
if (nfserr) {
/*
* nfsd_splice_actor may have already messed with the
@@ -3511,8 +3510,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
read->rd_vlen = v;
len = maxcount;
- nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
- read->rd_vlen, &maxcount);
+ nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+ resp->rqstp->rq_vec, read->rd_vlen, &maxcount);
+ read->rd_length = maxcount;
if (nfserr)
return nfserr;
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
@@ -4214,7 +4214,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr;
p = xdr_reserve_space(&resp->xdr, 4 + 4);
- *p++ = cpu_to_be32(copy->cp_consecutive);
+ *p++ = xdr_one; /* cr_consecutive */
*p++ = cpu_to_be32(copy->cp_synchronous);
return 0;
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8aa011820c4a..a008e7634181 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -87,13 +87,23 @@ nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
return nfserr_inval;
}
+static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags)
+{
+ if (flags & NFSEXP_INSECURE_PORT)
+ return true;
+ /* We don't require gss requests to use low ports: */
+ if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS)
+ return true;
+ return test_bit(RQ_SECURE, &rqstp->rq_flags);
+}
+
static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
struct svc_export *exp)
{
int flags = nfsexp_flags(rqstp, exp);
/* Check if the request originated from a secure port. */
- if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) {
+ if (!nfsd_originating_port_ok(rqstp, flags)) {
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
dprintk("nfsd: request from insecure port %s!\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 43c0419b8ddb..f107f9fa8e15 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -212,13 +212,18 @@ nfsd_proc_write(struct svc_rqst *rqstp)
struct nfsd_attrstat *resp = rqstp->rq_resp;
__be32 nfserr;
unsigned long cnt = argp->len;
+ unsigned int nvecs;
dprintk("nfsd: WRITE %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
- nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset,
- rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC);
+ nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt);
+ if (!nvecs)
+ return nfserr_io;
+ nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
+ argp->offset, rqstp->rq_vec, nvecs,
+ &cnt, NFS_DATA_SYNC);
return nfsd_return_attrs(nfserr, resp);
}
@@ -444,17 +449,19 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
struct svc_fh newfh;
__be32 nfserr;
+ if (argp->tlen > NFS_MAXPATHLEN)
+ return nfserr_nametoolong;
+
+ argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
+ argp->tlen);
+ if (IS_ERR(argp->tname))
+ return nfserrno(PTR_ERR(argp->tname));
+
dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
argp->tlen, argp->tname);
fh_init(&newfh, NFS_FHSIZE);
- /*
- * Crazy hack: the request fits in a page, and already-decoded
- * attributes follow argp->tname, so it's safe to just write a
- * null to ensure it's null-terminated:
- */
- argp->tname[argp->tlen] = '\0';
nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
argp->tname, &newfh);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 79b6064f8977..a43e8260520a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -71,22 +71,6 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp)
}
static __be32 *
-decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
-{
- char *name;
- unsigned int i;
-
- if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
- for (i = 0, name = *namp; i < *lenp; i++, name++) {
- if (*name == '\0')
- return NULL;
- }
- }
-
- return p;
-}
-
-static __be32 *
decode_sattr(__be32 *p, struct iattr *iap)
{
u32 tmp, tmp1;
@@ -287,7 +271,6 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
struct nfsd_writeargs *args = rqstp->rq_argp;
unsigned int len, hdr, dlen;
struct kvec *head = rqstp->rq_arg.head;
- int v;
p = decode_fh(p, &args->fh);
if (!p)
@@ -323,17 +306,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
if (dlen < XDR_QUADLEN(len)*4)
return 0;
- rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
- v = 0;
- while (len > rqstp->rq_vec[v].iov_len) {
- len -= rqstp->rq_vec[v].iov_len;
- v++;
- rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
- rqstp->rq_vec[v].iov_len = PAGE_SIZE;
- }
- rqstp->rq_vec[v].iov_len = len;
- args->vlen = v + 1;
+ args->first.iov_base = (void *)p;
+ args->first.iov_len = head->iov_len - hdr;
return 1;
}
@@ -394,14 +368,39 @@ int
nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_symlinkargs *args = rqstp->rq_argp;
+ char *base = (char *)p;
+ size_t xdrlen;
if ( !(p = decode_fh(p, &args->ffh))
- || !(p = decode_filename(p, &args->fname, &args->flen))
- || !(p = decode_pathname(p, &args->tname, &args->tlen)))
+ || !(p = decode_filename(p, &args->fname, &args->flen)))
return 0;
- p = decode_sattr(p, &args->attrs);
- return xdr_argsize_check(rqstp, p);
+ args->tlen = ntohl(*p++);
+ if (args->tlen == 0)
+ return 0;
+
+ args->first.iov_base = p;
+ args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+ args->first.iov_len -= (char *)p - base;
+
+ /* This request is never larger than a page. Therefore,
+ * transport will deliver either:
+ * 1. pathname in the pagelist -> sattr is in the tail.
+ * 2. everything in the head buffer -> sattr is in the head.
+ */
+ if (rqstp->rq_arg.page_len) {
+ if (args->tlen != rqstp->rq_arg.page_len)
+ return 0;
+ p = rqstp->rq_arg.tail[0].iov_base;
+ } else {
+ xdrlen = XDR_QUADLEN(args->tlen);
+ if (xdrlen > args->first.iov_len - (8 * sizeof(__be32)))
+ return 0;
+ p += xdrlen;
+ }
+ decode_sattr(p, &args->attrs);
+
+ return 1;
}
int
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 8b2f1d92c579..80933e4334d8 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -11,39 +11,79 @@
#include <linux/tracepoint.h>
#include "nfsfh.h"
+TRACE_EVENT(nfsd_compound,
+ TP_PROTO(const struct svc_rqst *rqst,
+ u32 args_opcnt),
+ TP_ARGS(rqst, args_opcnt),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, args_opcnt)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqst->rq_xid);
+ __entry->args_opcnt = args_opcnt;
+ ),
+ TP_printk("xid=0x%08x opcnt=%u",
+ __entry->xid, __entry->args_opcnt)
+)
+
+TRACE_EVENT(nfsd_compound_status,
+ TP_PROTO(u32 args_opcnt,
+ u32 resp_opcnt,
+ __be32 status,
+ const char *name),
+ TP_ARGS(args_opcnt, resp_opcnt, status, name),
+ TP_STRUCT__entry(
+ __field(u32, args_opcnt)
+ __field(u32, resp_opcnt)
+ __field(int, status)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->args_opcnt = args_opcnt;
+ __entry->resp_opcnt = resp_opcnt;
+ __entry->status = be32_to_cpu(status);
+ __assign_str(name, name);
+ ),
+ TP_printk("op=%u/%u %s status=%d",
+ __entry->resp_opcnt, __entry->args_opcnt,
+ __get_str(name), __entry->status)
+)
+
DECLARE_EVENT_CLASS(nfsd_io_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
loff_t offset,
- int len),
+ unsigned long len),
TP_ARGS(rqstp, fhp, offset, len),
TP_STRUCT__entry(
- __field(__be32, xid)
- __field_struct(struct knfsd_fh, fh)
+ __field(u32, xid)
+ __field(u32, fh_hash)
__field(loff_t, offset)
- __field(int, len)
+ __field(unsigned long, len)
),
TP_fast_assign(
- __entry->xid = rqstp->rq_xid,
- fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
__entry->offset = offset;
__entry->len = len;
),
- TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
- __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu",
+ __entry->xid, __entry->fh_hash,
__entry->offset, __entry->len)
)
#define DEFINE_NFSD_IO_EVENT(name) \
-DEFINE_EVENT(nfsd_io_class, name, \
+DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
TP_PROTO(struct svc_rqst *rqstp, \
struct svc_fh *fhp, \
loff_t offset, \
- int len), \
+ unsigned long len), \
TP_ARGS(rqstp, fhp, offset, len))
DEFINE_NFSD_IO_EVENT(read_start);
-DEFINE_NFSD_IO_EVENT(read_opened);
+DEFINE_NFSD_IO_EVENT(read_splice);
+DEFINE_NFSD_IO_EVENT(read_vector);
DEFINE_NFSD_IO_EVENT(read_io_done);
DEFINE_NFSD_IO_EVENT(read_done);
DEFINE_NFSD_IO_EVENT(write_start);
@@ -51,6 +91,40 @@ DEFINE_NFSD_IO_EVENT(write_opened);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
+DECLARE_EVENT_CLASS(nfsd_err_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ loff_t offset,
+ int status),
+ TP_ARGS(rqstp, fhp, offset, status),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(loff_t, offset)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->status = status;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld status=%d",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->status)
+)
+
+#define DEFINE_NFSD_ERR_EVENT(name) \
+DEFINE_EVENT(nfsd_err_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ loff_t offset, \
+ int len), \
+ TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_ERR_EVENT(read_err);
+DEFINE_NFSD_ERR_EVENT(write_err);
+
#include "state.h"
DECLARE_EVENT_CLASS(nfsd_stateid_class,
@@ -76,7 +150,7 @@ DECLARE_EVENT_CLASS(nfsd_stateid_class,
)
#define DEFINE_STATEID_EVENT(name) \
-DEFINE_EVENT(nfsd_stateid_class, name, \
+DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \
TP_PROTO(stateid_t *stp), \
TP_ARGS(stp))
DEFINE_STATEID_EVENT(layoutstate_alloc);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a3c9bfa77def..2410b093a2e6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -881,20 +881,24 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
-static __be32
-nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
+static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ unsigned long *count, int host_err)
{
if (host_err >= 0) {
nfsdstats.io_read += host_err;
*count = host_err;
fsnotify_access(file);
+ trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
return 0;
- } else
+ } else {
+ trace_nfsd_read_err(rqstp, fhp, offset, host_err);
return nfserrno(host_err);
+ }
}
-__be32 nfsd_splice_read(struct svc_rqst *rqstp,
- struct file *file, loff_t offset, unsigned long *count)
+__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset, unsigned long *count)
{
struct splice_desc sd = {
.len = 0,
@@ -904,21 +908,23 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp,
};
int host_err;
+ trace_nfsd_read_splice(rqstp, fhp, offset, *count);
rqstp->rq_next_page = rqstp->rq_respages + 1;
host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
- return nfsd_finish_read(file, count, host_err);
+ return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
}
-__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
- unsigned long *count)
+__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ struct kvec *vec, int vlen, unsigned long *count)
{
struct iov_iter iter;
int host_err;
+ trace_nfsd_read_vector(rqstp, fhp, offset, *count);
iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
host_err = vfs_iter_read(file, &iter, &offset, 0);
-
- return nfsd_finish_read(file, count, host_err);
+ return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
}
/*
@@ -965,13 +971,15 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
{
struct svc_export *exp;
struct iov_iter iter;
- __be32 err = 0;
+ __be32 nfserr;
int host_err;
int use_wgather;
loff_t pos = offset;
unsigned int pflags = current->flags;
rwf_t flags = 0;
+ trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
+
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
/*
* We want less throttling in balance_dirty_pages()
@@ -994,22 +1002,23 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
host_err = vfs_iter_write(file, &iter, &pos, flags);
if (host_err < 0)
goto out_nfserr;
- *cnt = host_err;
- nfsdstats.io_write += host_err;
+ nfsdstats.io_write += *cnt;
fsnotify_modify(file);
if (stable && use_wgather)
host_err = wait_for_concurrent_writes(file);
out_nfserr:
- dprintk("nfsd: write complete host_err=%d\n", host_err);
- if (host_err >= 0)
- err = 0;
- else
- err = nfserrno(host_err);
+ if (host_err >= 0) {
+ trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt);
+ nfserr = nfs_ok;
+ } else {
+ trace_nfsd_write_err(rqstp, fhp, offset, host_err);
+ nfserr = nfserrno(host_err);
+ }
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
current_restore_flags(pflags, PF_LESS_THROTTLE);
- return err;
+ return nfserr;
}
/*
@@ -1024,27 +1033,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct raparms *ra;
__be32 err;
- trace_read_start(rqstp, fhp, offset, vlen);
+ trace_nfsd_read_start(rqstp, fhp, offset, *count);
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
return err;
ra = nfsd_init_raparms(file);
- trace_read_opened(rqstp, fhp, offset, vlen);
-
if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
- err = nfsd_splice_read(rqstp, file, offset, count);
+ err = nfsd_splice_read(rqstp, fhp, file, offset, count);
else
- err = nfsd_readv(file, offset, vec, vlen, count);
-
- trace_read_io_done(rqstp, fhp, offset, vlen);
+ err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count);
if (ra)
nfsd_put_raparams(file, ra);
fput(file);
- trace_read_done(rqstp, fhp, offset, vlen);
+ trace_nfsd_read_done(rqstp, fhp, offset, *count);
return err;
}
@@ -1061,18 +1066,16 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
struct file *file = NULL;
__be32 err = 0;
- trace_write_start(rqstp, fhp, offset, vlen);
+ trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
goto out;
- trace_write_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
- trace_write_io_done(rqstp, fhp, offset, vlen);
fput(file);
out:
- trace_write_done(rqstp, fhp, offset, vlen);
+ trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
return err;
}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index be6d8e00453f..a7e107309f76 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -78,10 +78,13 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
struct raparms;
-__be32 nfsd_splice_read(struct svc_rqst *,
- struct file *, loff_t, unsigned long *);
-__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int,
- unsigned long *);
+__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ unsigned long *count);
+__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ struct kvec *vec, int vlen,
+ unsigned long *count);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 2f4f22e6b8cb..ea7cca3a64b7 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -34,7 +34,7 @@ struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
int len;
- int vlen;
+ struct kvec first;
};
struct nfsd_createargs {
@@ -72,6 +72,7 @@ struct nfsd_symlinkargs {
char * tname;
unsigned int tlen;
struct iattr attrs;
+ struct kvec first;
};
struct nfsd_readdirargs {
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 056bf8a7364e..2cb29e961a76 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -41,7 +41,7 @@ struct nfsd3_writeargs {
__u32 count;
int stable;
__u32 len;
- int vlen;
+ struct kvec first;
};
struct nfsd3_createargs {
@@ -90,6 +90,7 @@ struct nfsd3_symlinkargs {
char * tname;
unsigned int tlen;
struct iattr attrs;
+ struct kvec first;
};
struct nfsd3_readdirargs {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index bc29511b6405..17c453a7999c 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -110,6 +110,7 @@ struct nfsd4_create {
struct {
u32 datalen;
char *data;
+ struct kvec first;
} link; /* NF4LNK */
struct {
u32 specdata1;
@@ -118,12 +119,14 @@ struct nfsd4_create {
} u;
u32 cr_bmval[3]; /* request */
struct iattr cr_iattr; /* request */
+ int cr_umask; /* request */
struct nfsd4_change_info cr_cinfo; /* response */
struct nfs4_acl *cr_acl;
struct xdr_netobj cr_label;
};
#define cr_datalen u.link.datalen
#define cr_data u.link.data
+#define cr_first u.link.first
#define cr_specdata1 u.dev.specdata1
#define cr_specdata2 u.dev.specdata2
@@ -228,6 +231,7 @@ struct nfsd4_open {
u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */
u32 op_create; /* request */
u32 op_createmode; /* request */
+ int op_umask; /* request */
u32 op_bmval[3]; /* request */
struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
nfs4_verifier op_verf __attribute__((aligned(32)));
@@ -518,7 +522,6 @@ struct nfsd4_copy {
u64 cp_count;
/* both */
- bool cp_consecutive;
bool cp_synchronous;
/* response */
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6702a6a0bbb5..d51e1bb781cf 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -139,23 +139,32 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
return false;
}
-struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
const struct path *path)
{
struct fanotify_event_info *event;
+ gfp_t gfp = GFP_KERNEL;
+
+ /*
+ * For queues with unlimited length lost events are not expected and
+ * can possibly have security implications. Avoid losing events when
+ * memory is short.
+ */
+ if (group->max_events == UINT_MAX)
+ gfp |= __GFP_NOFAIL;
if (fanotify_is_perm_event(mask)) {
struct fanotify_perm_event_info *pevent;
- pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
- GFP_KERNEL);
+ pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
return NULL;
event = &pevent->fae;
pevent->response = 0;
goto init;
}
- event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+ event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
return NULL;
init: __maybe_unused
@@ -210,10 +219,17 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- event = fanotify_alloc_event(inode, mask, data);
+ event = fanotify_alloc_event(group, inode, mask, data);
ret = -ENOMEM;
- if (unlikely(!event))
+ if (unlikely(!event)) {
+ /*
+ * We don't queue overflow events for permission events as
+ * there the access is denied and so no event is in fact lost.
+ */
+ if (!fanotify_is_perm_event(mask))
+ fsnotify_queue_overflow(group);
goto finish;
+ }
fsn_event = &event->fse;
ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 256d9d1ddea9..8609ba06f474 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -52,5 +52,6 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct fanotify_event_info, fse);
}
-struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
const struct path *path);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c07eb3d655ea..ec4d8c59d0e3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -757,7 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
- oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
+ oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
if (unlikely(!oevent)) {
fd = -ENOMEM;
goto out_destroy_group;
@@ -820,9 +820,8 @@ out_destroy_group:
return fd;
}
-SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
- __u64, mask, int, dfd,
- const char __user *, pathname)
+static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ int dfd, const char __user *pathname)
{
struct inode *inode = NULL;
struct vfsmount *mnt = NULL;
@@ -928,13 +927,20 @@ fput_and_out:
return ret;
}
+SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
+ __u64, mask, int, dfd,
+ const char __user *, pathname)
+{
+ return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(fanotify_mark,
int, fanotify_fd, unsigned int, flags,
__u32, mask0, __u32, mask1, int, dfd,
const char __user *, pathname)
{
- return sys_fanotify_mark(fanotify_fd, flags,
+ return do_fanotify_mark(fanotify_fd, flags,
#ifdef __BIG_ENDIAN
((__u64)mask0 << 32) | mask1,
#else
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 8b73332735ba..40dedb37a1f3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -99,8 +99,14 @@ int inotify_handle_event(struct fsnotify_group *group,
fsn_mark);
event = kmalloc(alloc_len, GFP_KERNEL);
- if (unlikely(!event))
+ if (unlikely(!event)) {
+ /*
+ * Treat lost event due to ENOMEM the same way as queue
+ * overflow to let userspace know event was lost.
+ */
+ fsnotify_queue_overflow(group);
return -ENOMEM;
+ }
fsn_event = &event->fse;
fsnotify_init_event(fsn_event, inode, mask);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 2c908b31d6c9..ef32f3657958 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -307,6 +307,20 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
spin_unlock(&group->notification_lock);
ret = put_user(send_len, (int __user *) p);
break;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ case INOTIFY_IOC_SETNEXTWD:
+ ret = -EINVAL;
+ if (arg >= 1 && arg <= INT_MAX) {
+ struct inotify_group_private_data *data;
+
+ data = &group->inotify_data;
+ spin_lock(&data->idr_lock);
+ idr_set_cursor(&data->idr, (unsigned int)arg);
+ spin_unlock(&data->idr_lock);
+ ret = 0;
+ }
+ break;
+#endif /* CONFIG_CHECKPOINT_RESTORE */
}
return ret;
@@ -635,7 +649,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
/* inotify syscalls */
-SYSCALL_DEFINE1(inotify_init1, int, flags)
+static int do_inotify_init(int flags)
{
struct fsnotify_group *group;
int ret;
@@ -660,9 +674,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
return ret;
}
+SYSCALL_DEFINE1(inotify_init1, int, flags)
+{
+ return do_inotify_init(flags);
+}
+
SYSCALL_DEFINE0(inotify_init)
{
- return sys_inotify_init1(0);
+ return do_inotify_init(0);
}
SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 66f85c651c52..3c3e36745f59 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -111,7 +111,8 @@ int fsnotify_add_event(struct fsnotify_group *group,
return 2;
}
- if (group->q_len >= group->max_events) {
+ if (event == group->overflow_event ||
+ group->q_len >= group->max_events) {
ret = 2;
/* Queue overflow event only if it isn't already queued */
if (!list_empty(&group->overflow_event->list)) {
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 36b0772701a0..60702d677bd4 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -184,6 +184,7 @@ int open_related_ns(struct ns_common *ns,
return fd;
}
+EXPORT_SYMBOL_GPL(open_related_ns);
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 2831f495a674..32c523cf5a2d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -381,7 +381,7 @@ unm_err_out:
* vfs inode dirty. This ensures that any changes to the mft record are
* written out to disk.
*
- * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
+ * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
* on the base vfs inode, because even though file data may have been modified,
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
@@ -407,7 +407,7 @@ void __mark_mft_record_dirty(ntfs_inode *ni)
else
base_ni = ni->ext.base_ntfs_ino;
mutex_unlock(&ni->extent_lock);
- __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
}
static const char *ntfs_please_email = "Please email "
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9a876bb07cac..0f157bbd3e0f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7119,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_commit;
did_quota = 1;
- data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+ data_ac->ac_resv = &oi->ip_la_data_resv;
ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&num);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e8e205bf2e41..302cd7caa4a7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -346,7 +346,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
unlock = 0;
out_alloc:
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
out_inode_unlock:
ocfs2_inode_unlock(inode, 0);
out:
@@ -2213,7 +2213,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
down_write(&oi->ip_alloc_sem);
if (first_get_block) {
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(osb))
ret = ocfs2_zero_tail(inode, di_bh, pos);
else
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 8614ff069d99..3494a62ed749 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,7 +78,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
/*
* Using a named enum representing lock types in terms of #N bit stored in
* iocb->private, which is going to be used for communication between
- * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ * ocfs2_dio_end_io() and ocfs2_file_write/read_iter().
*/
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ea8c551bcd7e..91a8889abf9b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -570,7 +570,16 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
current_page, vec_len, vec_start);
len = bio_add_page(bio, page, vec_len, vec_start);
- if (len != vec_len) break;
+ if (len != vec_len) {
+ mlog(ML_ERROR, "Adding page[%d] to bio failed, "
+ "page %p, len %d, vec_len %u, vec_start %u, "
+ "bi_sector %llu\n", current_page, page, len,
+ vec_len, vec_start,
+ (unsigned long long)bio->bi_iter.bi_sector);
+ bio_put(bio);
+ bio = ERR_PTR(-EIO);
+ return bio;
+ }
cs += vec_len / (PAGE_SIZE/spp);
vec_start = 0;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index eac5140aac47..e5076185cc1e 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1819,7 +1819,7 @@ int o2net_register_hb_callbacks(void)
static int o2net_accept_one(struct socket *sock, int *more)
{
- int ret, slen;
+ int ret;
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
@@ -1864,9 +1864,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
goto out;
}
- slen = sizeof(sin);
- ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
- &slen, 1);
+ ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1);
if (ret < 0)
goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 977763d4c27d..b048d4fa3959 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3072,7 +3072,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We need to return the correct block within the
* cluster which should hold our entry.
*/
- off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+ off = ocfs2_dx_dir_hash_idx(osb,
&lookup->dl_hinfo);
get_bh(dx_leaves[off]);
lookup->dl_dx_leaf_bh = dx_leaves[off];
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index fd6bbbbd7d78..39831fc2fd52 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -224,14 +224,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
dlm_astlockfunc_t *fn;
- struct dlm_lockstatus *lksb;
mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
- lksb = lock->lksb;
fn = lock->ast;
BUG_ON(lock->ml.node != dlm->node_num);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e9f3705c4c9f..d06e27ec4be4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
u8 node_num;
u32 key;
u8 joining_node;
+ u8 migrate_done; /* set to 1 means node has migrated all lock resources */
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -960,13 +961,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
-u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
-int dlm_nm_init(struct dlm_ctxt *dlm);
-int dlm_heartbeat_init(struct dlm_ctxt *dlm);
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index e1fea149f50b..425081be6161 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,19 @@ redo_bucket:
cond_resched_lock(&dlm->spinlock);
num += n;
}
+
+ if (!num) {
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "%s: perhaps there are more lock resources "
+ "need to be migrated after dlm recovery\n", dlm->name);
+ ret = -EAGAIN;
+ } else {
+ mlog(0, "%s: we won't do dlm recovery after migrating "
+ "all lock resources\n", dlm->name);
+ dlm->migrate_done = 1;
+ }
+ }
+
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
@@ -675,20 +688,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
spin_unlock(&dlm->spinlock);
}
-int dlm_shutting_down(struct dlm_ctxt *dlm)
-{
- int ret = 0;
-
- spin_lock(&dlm_domain_lock);
-
- if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
- ret = 1;
-
- spin_unlock(&dlm_domain_lock);
-
- return ret;
-}
-
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
int leave = 0;
@@ -2052,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
init_waitqueue_head(&dlm->dlm_join_events);
+ dlm->migrate_done = 0;
+
dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index fd6122a38dbd..8a9281411c18 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,30 @@
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
-int dlm_shutting_down(struct dlm_ctxt *dlm);
+static inline int dlm_joined(struct dlm_ctxt *dlm)
+{
+ int ret = 0;
+
+ spin_lock(&dlm_domain_lock);
+ if (dlm->dlm_state == DLM_CTXT_JOINED)
+ ret = 1;
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
+static inline int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+ int ret = 0;
+
+ spin_lock(&dlm_domain_lock);
+ if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+ ret = 1;
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 66c2a491f68d..74962315794e 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -77,8 +77,7 @@ int dlm_init_lock_cache(void)
void dlm_destroy_lock_cache(void)
{
- if (dlm_lock_cache)
- kmem_cache_destroy(dlm_lock_cache);
+ kmem_cache_destroy(dlm_lock_cache);
}
/* Tell us whether we can grant a new lock request.
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a7df226f9449..aaca0949fe53 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -414,8 +414,7 @@ int dlm_init_mle_cache(void)
void dlm_destroy_mle_cache(void)
{
- if (dlm_mle_cache)
- kmem_cache_destroy(dlm_mle_cache);
+ kmem_cache_destroy(dlm_mle_cache);
}
static void dlm_mle_release(struct kref *kref)
@@ -472,15 +471,11 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache) {
- kmem_cache_destroy(dlm_lockname_cache);
- dlm_lockname_cache = NULL;
- }
+ kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
- if (dlm_lockres_cache) {
- kmem_cache_destroy(dlm_lockres_cache);
- dlm_lockres_cache = NULL;
- }
+ kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
}
static void dlm_lockres_release(struct kref *kref)
@@ -2495,13 +2490,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
}
/*
- * A migrateable resource is one that is :
+ * A migratable resource is one that is :
* 1. locally mastered, and,
* 2. zero local locks, and,
* 3. one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
-static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
@@ -2532,7 +2527,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
- mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+ mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
@@ -2548,7 +2543,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
return 0;
}
- mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
res->lockname.name);
return 1;
@@ -2792,7 +2787,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
- if (dlm_is_lockres_migrateable(dlm, res))
+ if (dlm_is_lockres_migratable(dlm, res))
target = dlm_pick_migration_target(dlm, res);
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ec8f75813beb..802636d50365 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -62,7 +62,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_request_all_locks(struct dlm_ctxt *dlm,
u8 request_from, u8 dead_node);
-static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);
static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
@@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
- spin_lock(&dlm->spinlock);
+ assert_spin_locked(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
- spin_unlock(&dlm->spinlock);
}
static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
spin_lock(&dlm->spinlock);
+ if (dlm->migrate_done) {
+ mlog(0, "%s: no need do recovery after migrating all "
+ "lock resources\n", dlm->name);
+ spin_unlock(&dlm->spinlock);
+ return 0;
+ }
+
/* check to see if the new master has died */
if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.dead_node);
- spin_unlock(&dlm->spinlock);
/* take write barrier */
/* (stops the list reshuffling thread, proxy ast handling) */
dlm_begin_recovery(dlm);
+ spin_unlock(&dlm->spinlock);
+
if (dlm->reco.new_master == dlm->node_num)
goto master_here;
@@ -739,7 +746,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
if (destroy)
- dlm_destroy_recovery_area(dlm, dead_node);
+ dlm_destroy_recovery_area(dlm);
return status;
}
@@ -764,7 +771,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
if (!ndata) {
- dlm_destroy_recovery_area(dlm, dead_node);
+ dlm_destroy_recovery_area(dlm);
return -ENOMEM;
}
ndata->node_num = num;
@@ -778,7 +785,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
return 0;
}
-static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
{
struct dlm_reco_node_data *ndata, *next;
LIST_HEAD(tmplist);
@@ -1378,6 +1385,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
if (!dlm_grab(dlm))
return -EINVAL;
+ if (!dlm_joined(dlm)) {
+ mlog(ML_ERROR, "Domain %s not joined! "
+ "lockres %.*s, master %u\n",
+ dlm->name, mres->lockname_len,
+ mres->lockname, mres->master);
+ dlm_put(dlm);
+ return -EINVAL;
+ }
+
BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
real_master = mres->master;
@@ -1807,7 +1823,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
int i, j, bad;
struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
- unsigned int added = 0;
__be64 c;
mlog(0, "running %d locks for this lockres\n", mres->num_locks);
@@ -1823,7 +1838,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
- added++;
break;
}
BUG_ON(ml->highest_blocked != LKM_IVMODE);
@@ -1911,7 +1925,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, queue);
spin_unlock(&res->spinlock);
- added++;
mlog(0, "just reordered a local lock!\n");
continue;
@@ -2037,7 +2050,6 @@ skip_lvb:
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
dlm_lockres_set_refmap_bit(dlm, res, ml->node);
- added++;
}
spin_unlock(&res->spinlock);
}
@@ -2331,13 +2343,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
__dlm_dirty_lockres(dlm, res);
}
-/* if this node is the recovery master, and there are no
- * locks for a given lockres owned by this node that are in
- * either PR or EX mode, zero out the lvb before requesting.
- *
- */
-
-
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
struct dlm_lock_resource *res;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 9479f99c2145..97a972efab83 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1756,8 +1756,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
- 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1796,7 +1795,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
write ? "EXMODE" : "PRMODE");
if (!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
}
/*
@@ -1816,8 +1815,7 @@ int ocfs2_open_lock(struct inode *inode)
lockres = &OCFS2_I(inode)->ip_open_lockres;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR, 0, 0);
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1854,8 +1852,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
* other nodes and the -EAGAIN will indicate to the caller that
* this inode is still in use.
*/
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- level, DLM_LKF_NOQUEUE, 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
out:
return status;
@@ -1876,11 +1873,9 @@ void ocfs2_open_unlock(struct inode *inode)
goto out;
if(lockres->l_ro_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
if(lockres->l_ex_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_EX);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
out:
return;
@@ -2601,9 +2596,9 @@ void ocfs2_inode_unlock(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+ if (!ocfs2_is_hard_readonly(osb) &&
!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
}
/*
@@ -3537,7 +3532,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
* expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
* we can recover correctly from node failure. Otherwise, we may get
- * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
if (!ocfs2_is_o2cb_active() &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5d1784a365a3..6ee94bc23f5b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -101,7 +101,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
trace_ocfs2_file_open(inode, file, file->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)oi->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name, mode);
@@ -116,7 +116,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
/* Check that the inode hasn't been wiped from disk by another
* node. If it hasn't then we're safe as long as we hold the
* spin lock until our increment of open count. */
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+ if (oi->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&oi->ip_lock);
status = -ENOENT;
@@ -190,7 +190,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
bool needs_barrier = false;
trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
- OCFS2_I(inode)->ip_blkno,
+ oi->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
@@ -296,7 +296,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
ocfs2_journal_dirty(handle, bh);
out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ ocfs2_commit_trans(osb, handle);
out:
return ret;
}
@@ -2257,7 +2257,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
- trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+ trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
@@ -2405,7 +2405,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
- trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+ trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
filp->f_path.dentry->d_name.len,
filp->f_path.dentry->d_name.name,
@@ -2448,7 +2448,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
*
* Take and drop the meta data lock to update inode fields
* like i_size. This allows the checks down below
- * generic_file_aio_read() a chance of actually working.
+ * generic_file_read_iter() a chance of actually working.
*/
ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
!nowait);
@@ -2460,7 +2460,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
ocfs2_inode_unlock(inode, lock_level);
ret = generic_file_read_iter(iocb, to);
- trace_generic_file_aio_read_ret(ret);
+ trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index e87279e49ba3..f65f2b2f594d 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = {
"UNSUPPORTED"
};
-static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
-static LIST_HEAD(ocfs2_filecheck_sysfs_list);
-
-struct ocfs2_filecheck {
- struct list_head fc_head; /* File check entry list head */
- spinlock_t fc_lock;
- unsigned int fc_max; /* Maximum number of entry in list */
- unsigned int fc_size; /* Current entry count in list */
- unsigned int fc_done; /* Finished entry count in list */
-};
-
-struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
- struct list_head fs_list;
- atomic_t fs_count;
- struct super_block *fs_sb;
- struct kset *fs_devicekset;
- struct kset *fs_fcheckkset;
- struct ocfs2_filecheck *fs_fcheck;
-};
-
-#define OCFS2_FILECHECK_MAXSIZE 100
-#define OCFS2_FILECHECK_MINSIZE 10
-
-/* File check operation type */
-enum {
- OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
- OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
- OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
-};
-
struct ocfs2_filecheck_entry {
struct list_head fe_list;
unsigned long fe_ino;
@@ -110,34 +80,84 @@ ocfs2_filecheck_error(int errno)
return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
}
-static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf);
-static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count);
-static struct kobj_attribute ocfs2_attr_filecheck_chk =
+static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+static struct kobj_attribute ocfs2_filecheck_attr_chk =
__ATTR(check, S_IRUSR | S_IWUSR,
- ocfs2_filecheck_show,
- ocfs2_filecheck_store);
-static struct kobj_attribute ocfs2_attr_filecheck_fix =
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct kobj_attribute ocfs2_filecheck_attr_fix =
__ATTR(fix, S_IRUSR | S_IWUSR,
- ocfs2_filecheck_show,
- ocfs2_filecheck_store);
-static struct kobj_attribute ocfs2_attr_filecheck_set =
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct kobj_attribute ocfs2_filecheck_attr_set =
__ATTR(set, S_IRUSR | S_IWUSR,
- ocfs2_filecheck_show,
- ocfs2_filecheck_store);
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct attribute *ocfs2_filecheck_attrs[] = {
+ &ocfs2_filecheck_attr_chk.attr,
+ &ocfs2_filecheck_attr_fix.attr,
+ &ocfs2_filecheck_attr_set.attr,
+ NULL
+};
+
+static void ocfs2_filecheck_release(struct kobject *kobj)
+{
+ struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
+
+ complete(&entry->fs_kobj_unregister);
+}
+
+static ssize_t
+ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ ssize_t ret = -EIO;
+ struct kobj_attribute *kattr = container_of(attr,
+ struct kobj_attribute, attr);
+
+ kobject_get(kobj);
+ if (kattr->show)
+ ret = kattr->show(kobj, kattr, buf);
+ kobject_put(kobj);
+ return ret;
+}
+
+static ssize_t
+ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret = -EIO;
+ struct kobj_attribute *kattr = container_of(attr,
+ struct kobj_attribute, attr);
+
+ kobject_get(kobj);
+ if (kattr->store)
+ ret = kattr->store(kobj, kattr, buf, count);
+ kobject_put(kobj);
+ return ret;
+}
+
+static const struct sysfs_ops ocfs2_filecheck_ops = {
+ .show = ocfs2_filecheck_show,
+ .store = ocfs2_filecheck_store,
+};
+
+static struct kobj_type ocfs2_ktype_filecheck = {
+ .default_attrs = ocfs2_filecheck_attrs,
+ .sysfs_ops = &ocfs2_filecheck_ops,
+ .release = ocfs2_filecheck_release,
+};
static void
ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
{
struct ocfs2_filecheck_entry *p;
- if (!atomic_dec_and_test(&entry->fs_count))
- wait_on_atomic_t(&entry->fs_count, atomic_t_wait,
- TASK_UNINTERRUPTIBLE);
-
spin_lock(&entry->fs_fcheck->fc_lock);
while (!list_empty(&entry->fs_fcheck->fc_head)) {
p = list_first_entry(&entry->fs_fcheck->fc_head,
@@ -148,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
}
spin_unlock(&entry->fs_fcheck->fc_lock);
- kset_unregister(entry->fs_fcheckkset);
- kset_unregister(entry->fs_devicekset);
kfree(entry->fs_fcheck);
- kfree(entry);
-}
-
-static void
-ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
-{
- spin_lock(&ocfs2_filecheck_sysfs_lock);
- list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ entry->fs_fcheck = NULL;
}
-static int ocfs2_filecheck_sysfs_del(const char *devname)
+int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb)
{
- struct ocfs2_filecheck_sysfs_entry *p;
-
- spin_lock(&ocfs2_filecheck_sysfs_lock);
- list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
- if (!strcmp(p->fs_sb->s_id, devname)) {
- list_del(&p->fs_list);
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- ocfs2_filecheck_sysfs_free(p);
- return 0;
- }
- }
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- return 1;
-}
-
-static void
-ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
-{
- if (atomic_dec_and_test(&entry->fs_count))
- wake_up_atomic_t(&entry->fs_count);
-}
-
-static struct ocfs2_filecheck_sysfs_entry *
-ocfs2_filecheck_sysfs_get(const char *devname)
-{
- struct ocfs2_filecheck_sysfs_entry *p = NULL;
-
- spin_lock(&ocfs2_filecheck_sysfs_lock);
- list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
- if (!strcmp(p->fs_sb->s_id, devname)) {
- atomic_inc(&p->fs_count);
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- return p;
- }
- }
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- return NULL;
-}
-
-int ocfs2_filecheck_create_sysfs(struct super_block *sb)
-{
- int ret = 0;
- struct kset *device_kset = NULL;
- struct kset *fcheck_kset = NULL;
- struct ocfs2_filecheck *fcheck = NULL;
- struct ocfs2_filecheck_sysfs_entry *entry = NULL;
- struct attribute **attrs = NULL;
- struct attribute_group attrgp;
-
- if (!ocfs2_kset)
- return -ENOMEM;
-
- attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
- if (!attrs) {
- ret = -ENOMEM;
- goto error;
- } else {
- attrs[0] = &ocfs2_attr_filecheck_chk.attr;
- attrs[1] = &ocfs2_attr_filecheck_fix.attr;
- attrs[2] = &ocfs2_attr_filecheck_set.attr;
- attrs[3] = NULL;
- memset(&attrgp, 0, sizeof(attrgp));
- attrgp.attrs = attrs;
- }
+ int ret;
+ struct ocfs2_filecheck *fcheck;
+ struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent;
fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
- if (!fcheck) {
- ret = -ENOMEM;
- goto error;
- } else {
- INIT_LIST_HEAD(&fcheck->fc_head);
- spin_lock_init(&fcheck->fc_lock);
- fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
- fcheck->fc_size = 0;
- fcheck->fc_done = 0;
- }
-
- if (strlen(sb->s_id) <= 0) {
- mlog(ML_ERROR,
- "Cannot get device basename when create filecheck sysfs\n");
- ret = -ENODEV;
- goto error;
- }
-
- device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
- if (!device_kset) {
- ret = -ENOMEM;
- goto error;
- }
+ if (!fcheck)
+ return -ENOMEM;
- fcheck_kset = kset_create_and_add("filecheck", NULL,
- &device_kset->kobj);
- if (!fcheck_kset) {
- ret = -ENOMEM;
- goto error;
+ INIT_LIST_HEAD(&fcheck->fc_head);
+ spin_lock_init(&fcheck->fc_lock);
+ fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+ fcheck->fc_size = 0;
+ fcheck->fc_done = 0;
+
+ entry->fs_kobj.kset = osb->osb_dev_kset;
+ init_completion(&entry->fs_kobj_unregister);
+ ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck,
+ NULL, "filecheck");
+ if (ret) {
+ kfree(fcheck);
+ return ret;
}
- ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
- if (ret)
- goto error;
-
- entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
- if (!entry) {
- ret = -ENOMEM;
- goto error;
- } else {
- atomic_set(&entry->fs_count, 1);
- entry->fs_sb = sb;
- entry->fs_devicekset = device_kset;
- entry->fs_fcheckkset = fcheck_kset;
- entry->fs_fcheck = fcheck;
- ocfs2_filecheck_sysfs_add(entry);
- }
-
- kfree(attrs);
+ entry->fs_fcheck = fcheck;
return 0;
-
-error:
- kfree(attrs);
- kfree(entry);
- kfree(fcheck);
- kset_unregister(fcheck_kset);
- kset_unregister(device_kset);
- return ret;
}
-int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb)
{
- return ocfs2_filecheck_sysfs_del(sb->s_id);
+ if (!osb->osb_fc_ent.fs_fcheck)
+ return;
+
+ kobject_del(&osb->osb_fc_ent.fs_kobj);
+ kobject_put(&osb->osb_fc_ent.fs_kobj);
+ wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister);
+ ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent);
}
static int
@@ -309,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
spin_lock(&ent->fs_fcheck->fc_lock);
if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
- mlog(ML_ERROR,
+ mlog(ML_NOTICE,
"Cannot set online file check maximum entry number "
"to %u due to too many pending entries(%u)\n",
len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
@@ -386,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
return 0;
}
-static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
@@ -394,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
unsigned int type;
struct ocfs2_filecheck_entry *p;
- struct ocfs2_filecheck_sysfs_entry *ent;
+ struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
return -EINVAL;
- ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
- if (!ent) {
- mlog(ML_ERROR,
- "Cannot get the corresponding entry via device basename %s\n",
- kobj->name);
- return -ENODEV;
- }
-
if (type == OCFS2_FILECHECK_TYPE_SET) {
spin_lock(&ent->fs_fcheck->fc_lock);
total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
@@ -440,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
spin_unlock(&ent->fs_fcheck->fc_lock);
exit:
- ocfs2_filecheck_sysfs_put(ent);
return total;
}
-static int
+static inline int
+ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned long ino)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (!p->fe_done) {
+ if (p->fe_ino == ino)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int
ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
{
struct ocfs2_filecheck_entry *p;
@@ -483,21 +408,21 @@ static void
ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
struct ocfs2_filecheck_entry *entry)
{
- entry->fe_done = 1;
spin_lock(&ent->fs_fcheck->fc_lock);
+ entry->fe_done = 1;
ent->fs_fcheck->fc_done++;
spin_unlock(&ent->fs_fcheck->fc_lock);
}
static unsigned int
-ocfs2_filecheck_handle(struct super_block *sb,
+ocfs2_filecheck_handle(struct ocfs2_super *osb,
unsigned long ino, unsigned int flags)
{
unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
struct inode *inode = NULL;
int rc;
- inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+ inode = ocfs2_iget(osb, ino, flags, 0);
if (IS_ERR(inode)) {
rc = (int)(-(long)inode);
if (rc >= OCFS2_FILECHECK_ERR_START &&
@@ -515,11 +440,14 @@ static void
ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
struct ocfs2_filecheck_entry *entry)
{
+ struct ocfs2_super *osb = container_of(ent, struct ocfs2_super,
+ osb_fc_ent);
+
if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
- entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_status = ocfs2_filecheck_handle(osb,
entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
- entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_status = ocfs2_filecheck_handle(osb,
entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
else
entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
@@ -527,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
ocfs2_filecheck_done_entry(ent, entry);
}
-static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ ssize_t ret = 0;
struct ocfs2_filecheck_args args;
struct ocfs2_filecheck_entry *entry;
- struct ocfs2_filecheck_sysfs_entry *ent;
- ssize_t ret = 0;
+ struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
if (count == 0)
return count;
- if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
- mlog(ML_ERROR, "Invalid arguments for online file check\n");
+ if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args))
return -EINVAL;
- }
-
- ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
- if (!ent) {
- mlog(ML_ERROR,
- "Cannot get the corresponding entry via device basename %s\n",
- kobj->parent->name);
- return -ENODEV;
- }
if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
@@ -564,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
}
spin_lock(&ent->fs_fcheck->fc_lock);
- if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
- (ent->fs_fcheck->fc_done == 0)) {
- mlog(ML_ERROR,
+ if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) {
+ ret = -EEXIST;
+ kfree(entry);
+ } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done == 0)) {
+ mlog(ML_NOTICE,
"Cannot do more file check "
"since file check queue(%u) is full now\n",
ent->fs_fcheck->fc_max);
- ret = -EBUSY;
+ ret = -EAGAIN;
kfree(entry);
} else {
if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
@@ -595,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
ocfs2_filecheck_handle_entry(ent, entry);
exit:
- ocfs2_filecheck_sysfs_put(ent);
return (!ret ? count : ret);
}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index e5cd002a2c09..6a22ee79e8d0 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -43,7 +43,32 @@ enum {
#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
-int ocfs2_filecheck_create_sysfs(struct super_block *sb);
-int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock;
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* Finished entry count in list */
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/* File check operation type */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */
+ struct kobject fs_kobj;
+ struct completion fs_kobj_unregister;
+ struct ocfs2_filecheck *fs_fcheck;
+};
+
+
+int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb);
+void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb);
#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d51b80edd972..ddc3e9470c87 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1135,7 +1135,7 @@ static void ocfs2_clear_inode(struct inode *inode)
trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink);
- mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+ mlog_bug_on_msg(osb == NULL,
"Inode=%lu\n", inode->i_ino);
dquot_drop(inode);
@@ -1150,7 +1150,7 @@ static void ocfs2_clear_inode(struct inode *inode)
ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
- ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ ocfs2_resv_discard(&osb->osb_la_resmap,
&oi->ip_la_data_resv);
ocfs2_resv_init_once(&oi->ip_la_data_resv);
@@ -1160,7 +1160,7 @@ static void ocfs2_clear_inode(struct inode *inode)
* exception here are successfully wiped inodes - their
* metadata can now be considered to be part of the system
* inodes from which it came. */
- if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+ if (!(oi->ip_flags & OCFS2_INODE_DELETED))
ocfs2_checkpoint_inode(inode);
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
@@ -1223,7 +1223,7 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
- jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+ jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c801eddc4bf3..8dd6f703c819 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -525,7 +525,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
* these are used by the support functions here and in
* callers. */
inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
- OCFS2_I(inode)->ip_blkno = fe_blkno;
+ oi->ip_blkno = fe_blkno;
spin_lock(&osb->osb_lock);
inode->i_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
@@ -1186,8 +1186,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
trace_ocfs2_double_lock_end(
- (unsigned long long)OCFS2_I(inode1)->ip_blkno,
- (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+ (unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
bail:
if (status)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6867eef2e06b..4f86ac0027b5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -50,6 +50,8 @@
#include "reservations.h"
+#include "filecheck.h"
+
/* Caching of metadata buffers */
/* Most user visible OCFS2 inodes will have very few pieces of
@@ -472,6 +474,12 @@ struct ocfs2_super
* workqueue and schedule on our own.
*/
struct workqueue_struct *ocfs2_wq;
+
+ /* sysfs directory per partition */
+ struct kset *osb_dev_kset;
+
+ /* file check related stuff */
+ struct ocfs2_filecheck_sysfs_entry osb_fc_ent;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index e2a11aaece10..2ee76a90ba8f 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1311,11 +1311,11 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter);
DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
@@ -1467,7 +1467,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write,
__entry->saved_pos, __entry->count, __entry->wait)
);
-DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret);
/* End of trace events for fs/ocfs2/file.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ab156e35ec00..01c6b3894406 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -573,7 +573,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
BUG_ON(ocfs2_is_refcount_inode(inode));
trace_ocfs2_create_refcount_tree(
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)oi->ip_blkno);
ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
if (ret) {
@@ -3359,7 +3359,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
unsigned int ext_flags;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ if (!ocfs2_refcount_tree(osb)) {
return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
inode->i_ino);
}
@@ -3707,7 +3707,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
trace_ocfs2_add_refcount_flag(ref_blocks, credits);
if (ref_blocks) {
- ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_reserve_new_metadata_blocks(osb,
ref_blocks, &meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4766,8 +4766,8 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
*bh2 = *bh1;
trace_ocfs2_double_lock_end(
- (unsigned long long)OCFS2_I(inode1)->ip_blkno,
- (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+ (unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
return 0;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index dae9eb7c441e..d2fb97b173da 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -398,7 +398,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
static int ocfs2_control_do_setversion_msg(struct file *file,
struct ocfs2_control_message_setv *msg)
- {
+{
long major, minor;
char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d8f5f6ce99dc..f7c972fbed6a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -79,8 +79,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}
-static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
-static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
@@ -387,7 +385,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
- bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ bg->bg_generation = cpu_to_le32(osb->fs_generation);
bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
@@ -1521,7 +1519,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
OCFS2_I(inode)->ip_clusters, max_bits);
}
- ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_block_group_find_clear_bits(osb,
group_bh, bits_wanted,
max_bits, res);
if (ret)
@@ -2626,53 +2624,6 @@ int ocfs2_release_clusters(handle_t *handle,
_ocfs2_clear_bit);
}
-static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
-{
- printk("Block Group:\n");
- printk("bg_signature: %s\n", bg->bg_signature);
- printk("bg_size: %u\n", bg->bg_size);
- printk("bg_bits: %u\n", bg->bg_bits);
- printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
- printk("bg_chain: %u\n", bg->bg_chain);
- printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
- printk("bg_next_group: %llu\n",
- (unsigned long long)bg->bg_next_group);
- printk("bg_parent_dinode: %llu\n",
- (unsigned long long)bg->bg_parent_dinode);
- printk("bg_blkno: %llu\n",
- (unsigned long long)bg->bg_blkno);
-}
-
-static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
-{
- int i;
-
- printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
- printk("i_signature: %s\n", fe->i_signature);
- printk("i_size: %llu\n",
- (unsigned long long)fe->i_size);
- printk("i_clusters: %u\n", fe->i_clusters);
- printk("i_generation: %u\n",
- le32_to_cpu(fe->i_generation));
- printk("id1.bitmap1.i_used: %u\n",
- le32_to_cpu(fe->id1.bitmap1.i_used));
- printk("id1.bitmap1.i_total: %u\n",
- le32_to_cpu(fe->id1.bitmap1.i_total));
- printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
- printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
- printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
- printk("id2.i_chain.cl_next_free_rec: %u\n",
- fe->id2.i_chain.cl_next_free_rec);
- for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
- printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
- fe->id2.i_chain.cl_recs[i].c_free);
- printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
- fe->id2.i_chain.cl_recs[i].c_total);
- printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
- (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
- }
-}
-
/*
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ffa4952d432b..3415e0b09398 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -423,10 +423,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
- if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+ if (jbd2_journal_start_commit(osb->journal->j_journal,
&target)) {
if (wait)
- jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+ jbd2_log_wait_commit(osb->journal->j_journal,
target);
}
return 0;
@@ -1161,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_complete_mount_recovery(osb);
+ osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
+ &ocfs2_kset->kobj);
+ if (!osb->osb_dev_kset) {
+ status = -ENOMEM;
+ mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
+ goto read_super_error;
+ }
+
+ /* Create filecheck sysfs related directories/files at
+ * /sys/fs/ocfs2/<devname>/filecheck */
+ if (ocfs2_filecheck_create_sysfs(osb)) {
+ status = -ENOMEM;
+ mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
+ "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
+ goto read_super_error;
+ }
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
@@ -1199,9 +1216,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
- /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
- ocfs2_filecheck_create_sysfs(sb);
-
return status;
read_super_error:
@@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
- ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1768,12 +1781,9 @@ static int ocfs2_initialize_mem_caches(void)
NULL);
if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
!ocfs2_qf_chunk_cachep) {
- if (ocfs2_inode_cachep)
- kmem_cache_destroy(ocfs2_inode_cachep);
- if (ocfs2_dquot_cachep)
- kmem_cache_destroy(ocfs2_dquot_cachep);
- if (ocfs2_qf_chunk_cachep)
- kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
return -ENOMEM;
}
@@ -1787,16 +1797,13 @@ static void ocfs2_free_mem_caches(void)
* destroy cache.
*/
rcu_barrier();
- if (ocfs2_inode_cachep)
- kmem_cache_destroy(ocfs2_inode_cachep);
+ kmem_cache_destroy(ocfs2_inode_cachep);
ocfs2_inode_cachep = NULL;
- if (ocfs2_dquot_cachep)
- kmem_cache_destroy(ocfs2_dquot_cachep);
+ kmem_cache_destroy(ocfs2_dquot_cachep);
ocfs2_dquot_cachep = NULL;
- if (ocfs2_qf_chunk_cachep)
- kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
ocfs2_qf_chunk_cachep = NULL;
}
@@ -1899,6 +1906,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
osb = OCFS2_SB(sb);
BUG_ON(!osb);
+ /* Remove file check sysfs related directories/files,
+ * and wait for the pending file check operations */
+ ocfs2_filecheck_remove_sysfs(osb);
+
+ kset_unregister(osb->osb_dev_kset);
+
debugfs_remove(osb->osb_ctxt);
/* Orphan scan should be stopped as early as possible */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 82e17b076ce7..78f09c76ab3c 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -633,6 +633,5 @@ int __init init_ocfs2_uptodate_cache(void)
void exit_ocfs2_uptodate_cache(void)
{
- if (ocfs2_uptodate_cachep)
- kmem_cache_destroy(ocfs2_uptodate_cachep);
+ kmem_cache_destroy(ocfs2_uptodate_cachep);
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c261c1dfd374..3a24ce3deb01 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3564,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode,
.not_found = -ENODATA,
};
- if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ if (!ocfs2_supports_xattr(osb))
return -EOPNOTSUPP;
/*
diff --git a/fs/open.c b/fs/open.c
index 7ea118471dce..c5ee7cd60424 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -128,7 +128,7 @@ out:
}
EXPORT_SYMBOL_GPL(vfs_truncate);
-static long do_sys_truncate(const char __user *pathname, loff_t length)
+long do_sys_truncate(const char __user *pathname, loff_t length)
{
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
@@ -162,7 +162,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
@@ -333,7 +333,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
}
EXPORT_SYMBOL_GPL(vfs_fallocate);
-SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
+int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
struct fd f = fdget(fd);
int error = -EBADF;
@@ -345,12 +345,17 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
return error;
}
+SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
+{
+ return ksys_fallocate(fd, mode, offset, len);
+}
+
/*
* access() needs to use the real uid/gid, not the effective uid/gid.
* We do this by temporarily clearing all FS-related capabilities and
* switching the fsuid/fsgid around to the real ones.
*/
-SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
+long do_faccessat(int dfd, const char __user *filename, int mode)
{
const struct cred *old_cred;
struct cred *override_cred;
@@ -426,12 +431,17 @@ out:
return res;
}
+SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
+{
+ return do_faccessat(dfd, filename, mode);
+}
+
SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
- return sys_faccessat(AT_FDCWD, filename, mode);
+ return do_faccessat(AT_FDCWD, filename, mode);
}
-SYSCALL_DEFINE1(chdir, const char __user *, filename)
+int ksys_chdir(const char __user *filename)
{
struct path path;
int error;
@@ -457,6 +467,11 @@ out:
return error;
}
+SYSCALL_DEFINE1(chdir, const char __user *, filename)
+{
+ return ksys_chdir(filename);
+}
+
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
@@ -479,7 +494,7 @@ out:
return error;
}
-SYSCALL_DEFINE1(chroot, const char __user *, filename)
+int ksys_chroot(const char __user *filename)
{
struct path path;
int error;
@@ -512,6 +527,11 @@ out:
return error;
}
+SYSCALL_DEFINE1(chroot, const char __user *, filename)
+{
+ return ksys_chroot(filename);
+}
+
static int chmod_common(const struct path *path, umode_t mode)
{
struct inode *inode = path->dentry->d_inode;
@@ -541,7 +561,7 @@ out_unlock:
return error;
}
-SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
+int ksys_fchmod(unsigned int fd, umode_t mode)
{
struct fd f = fdget(fd);
int err = -EBADF;
@@ -554,7 +574,12 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
return err;
}
-SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode)
+SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
+{
+ return ksys_fchmod(fd, mode);
+}
+
+int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
{
struct path path;
int error;
@@ -572,9 +597,15 @@ retry:
return error;
}
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
+ umode_t, mode)
+{
+ return do_fchmodat(dfd, filename, mode);
+}
+
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
- return sys_fchmodat(AT_FDCWD, filename, mode);
+ return do_fchmodat(AT_FDCWD, filename, mode);
}
static int chown_common(const struct path *path, uid_t user, gid_t group)
@@ -619,8 +650,8 @@ retry_deleg:
return error;
}
-SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
- gid_t, group, int, flag)
+int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
+ int flag)
{
struct path path;
int error = -EINVAL;
@@ -651,18 +682,24 @@ out:
return error;
}
+SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
+ gid_t, group, int, flag)
+{
+ return do_fchownat(dfd, filename, user, group, flag);
+}
+
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
- return sys_fchownat(AT_FDCWD, filename, user, group, 0);
+ return do_fchownat(AT_FDCWD, filename, user, group, 0);
}
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
- return sys_fchownat(AT_FDCWD, filename, user, group,
- AT_SYMLINK_NOFOLLOW);
+ return do_fchownat(AT_FDCWD, filename, user, group,
+ AT_SYMLINK_NOFOLLOW);
}
-SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
+int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
struct fd f = fdget(fd);
int error = -EBADF;
@@ -682,14 +719,9 @@ out:
return error;
}
-int open_check_o_direct(struct file *f)
+SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
- /* NB: we're sure to have correct a_ops only after f_op->open */
- if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
- return 0;
+ return ksys_fchown(fd, user, group);
}
static int do_dentry_open(struct file *f,
@@ -713,7 +745,7 @@ static int do_dentry_open(struct file *f,
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH;
f->f_op = &empty_fops;
- return 0;
+ goto done;
}
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
@@ -766,7 +798,12 @@ static int do_dentry_open(struct file *f,
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
-
+done:
+ /* NB: we're sure to have correct a_ops only after f_op->open */
+ error = -EINVAL;
+ if ((f->f_flags & O_DIRECT) &&
+ (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO))
+ goto out_fput;
return 0;
cleanup_all:
@@ -781,6 +818,9 @@ cleanup_file:
f->f_path.dentry = NULL;
f->f_inode = NULL;
return error;
+out_fput:
+ fput(f);
+ return error;
}
/**
@@ -878,20 +918,14 @@ struct file *dentry_open(const struct path *path, int flags,
BUG_ON(!path->mnt);
f = get_empty_filp();
- if (!IS_ERR(f)) {
- f->f_flags = flags;
- error = vfs_open(path, f, cred);
- if (!error) {
- /* from now on we need fput() to dispose of f */
- error = open_check_o_direct(f);
- if (error) {
- fput(f);
- f = ERR_PTR(error);
- }
- } else {
- put_filp(f);
- f = ERR_PTR(error);
- }
+ if (IS_ERR(f))
+ return f;
+
+ f->f_flags = flags;
+ error = vfs_open(path, f, cred);
+ if (error) {
+ put_filp(f);
+ return ERR_PTR(error);
}
return f;
}
@@ -1114,7 +1148,7 @@ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, fla
*/
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
- return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
+ return ksys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
}
#endif
@@ -1163,7 +1197,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
return retval;
}
-EXPORT_SYMBOL(sys_close);
/*
* This routine simulates a hangup on the tty, to arrange that users
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 480ea059a680..10587413b20e 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -9,7 +9,6 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
#include <linux/posix_acl_xattr.h>
-#include <linux/fs_struct.h>
struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
{
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index ea6256d136d1..00fadaf0da8f 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -500,7 +500,7 @@ int orangefs_normalize_to_errno(__s32 error_code)
* server.
*/
} else if (error_code > 0) {
- gossip_err("orangefs: error status receieved.\n");
+ gossip_err("orangefs: error status received.\n");
gossip_err("orangefs: assuming error code is inverted.\n");
error_code = -error_code;
}
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 406e72de88f6..ce6ff5a0a6e4 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -24,6 +24,8 @@ config OVERLAY_FS_REDIRECT_DIR
an overlay which has redirects on a kernel that doesn't support this
feature will have unexpected results.
+ If unsure, say N.
+
config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
bool "Overlayfs: follow redirects even if redirects are turned off"
default y
@@ -32,8 +34,13 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
Disable this to get a possibly more secure configuration, but that
might not be backward compatible with previous kernels.
+ If backward compatibility is not an issue, then it is safe and
+ recommended to say N here.
+
For more information, see Documentation/filesystems/overlayfs.txt
+ If unsure, say Y.
+
config OVERLAY_FS_INDEX
bool "Overlayfs: turn on inodes index feature by default"
depends on OVERLAY_FS
@@ -51,6 +58,8 @@ config OVERLAY_FS_INDEX
That is, mounting an overlay which has an inodes index on a kernel
that doesn't support this feature will have unexpected results.
+ If unsure, say N.
+
config OVERLAY_FS_NFS_EXPORT
bool "Overlayfs: turn on NFS export feature by default"
depends on OVERLAY_FS
@@ -72,3 +81,8 @@ config OVERLAY_FS_NFS_EXPORT
Note, that the NFS export feature is not backward compatible.
That is, mounting an overlay which has a full index on a kernel
that doesn't support this feature will have unexpected results.
+
+ Most users should say N here and enable this feature on a case-by-
+ case basis with the "nfs_export=on" mount option.
+
+ Say N unless you fully understand the consequences.
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index bb94ce9da5c8..87bd4148f4fb 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -19,6 +19,142 @@
#include <linux/ratelimit.h>
#include "overlayfs.h"
+static int ovl_encode_maybe_copy_up(struct dentry *dentry)
+{
+ int err;
+
+ if (ovl_dentry_upper(dentry))
+ return 0;
+
+ err = ovl_want_write(dentry);
+ if (!err) {
+ err = ovl_copy_up(dentry);
+ ovl_drop_write(dentry);
+ }
+
+ if (err) {
+ pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
+ dentry, err);
+ }
+
+ return err;
+}
+
+/*
+ * Before encoding a non-upper directory file handle from real layer N, we need
+ * to check if it will be possible to reconnect an overlay dentry from the real
+ * lower decoded dentry. This is done by following the overlay ancestry up to a
+ * "layer N connected" ancestor and verifying that all parents along the way are
+ * "layer N connectable". If an ancestor that is NOT "layer N connectable" is
+ * found, we need to copy up an ancestor, which is "layer N connectable", thus
+ * making that ancestor "layer N connected". For example:
+ *
+ * layer 1: /a
+ * layer 2: /a/b/c
+ *
+ * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
+ * copied up and renamed, upper dir /a will be indexed by lower dir /a from
+ * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
+ * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
+ * dentry from the connected lower dentry /a/b/c.
+ *
+ * To avoid this problem at decode time, we need to copy up an ancestor of
+ * /a/b/c, which is "layer 2 connectable", at encode time. That ancestor is
+ * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
+ * and when the time comes to decode the file handle from lower dentry /a/b/c,
+ * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
+ * a connected overlay dentry will be accomplished.
+ *
+ * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
+ * entry /a in the lower layers above layer N and find the indexed dir /a from
+ * layer 1. If that improvement is made, then the check for "layer N connected"
+ * will need to verify there are no redirects in lower layers above N. In the
+ * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
+ * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
+ *
+ * layer 1: /A (redirect = /a)
+ * layer 2: /a/b/c
+ */
+
+/* Return the lowest layer for encoding a connectable file handle */
+static int ovl_connectable_layer(struct dentry *dentry)
+{
+ struct ovl_entry *oe = OVL_E(dentry);
+
+ /* We can get overlay root from root of any layer */
+ if (dentry == dentry->d_sb->s_root)
+ return oe->numlower;
+
+ /*
+ * If it's an unindexed merge dir, then it's not connectable with any
+ * lower layer
+ */
+ if (ovl_dentry_upper(dentry) &&
+ !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+ return 0;
+
+ /* We can get upper/overlay path from indexed/lower dentry */
+ return oe->lowerstack[0].layer->idx;
+}
+
+/*
+ * @dentry is "connected" if all ancestors up to root or a "connected" ancestor
+ * have the same uppermost lower layer as the origin's layer. We may need to
+ * copy up a "connectable" ancestor to make it "connected". A "connected" dentry
+ * cannot become non "connected", so cache positive result in dentry flags.
+ *
+ * Return the connected origin layer or < 0 on error.
+ */
+static int ovl_connect_layer(struct dentry *dentry)
+{
+ struct dentry *next, *parent = NULL;
+ int origin_layer;
+ int err = 0;
+
+ if (WARN_ON(dentry == dentry->d_sb->s_root) ||
+ WARN_ON(!ovl_dentry_lower(dentry)))
+ return -EIO;
+
+ origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
+ if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
+ return origin_layer;
+
+ /* Find the topmost origin layer connectable ancestor of @dentry */
+ next = dget(dentry);
+ for (;;) {
+ parent = dget_parent(next);
+ if (WARN_ON(parent == next)) {
+ err = -EIO;
+ break;
+ }
+
+ /*
+ * If @parent is not origin layer connectable, then copy up
+ * @next which is origin layer connectable and we are done.
+ */
+ if (ovl_connectable_layer(parent) < origin_layer) {
+ err = ovl_encode_maybe_copy_up(next);
+ break;
+ }
+
+ /* If @parent is connected or indexed we are done */
+ if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
+ ovl_test_flag(OVL_INDEX, d_inode(parent)))
+ break;
+
+ dput(next);
+ next = parent;
+ }
+
+ dput(parent);
+ dput(next);
+
+ if (!err)
+ ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
+
+ return err ?: origin_layer;
+}
+
/*
* We only need to encode origin if there is a chance that the same object was
* encoded pre copy up and then we need to stay consistent with the same
@@ -41,73 +177,59 @@
* L = lower file handle
*
* (*) Connecting an overlay dir from real lower dentry is not always
- * possible when there are redirects in lower layers. To mitigate this case,
- * we copy up the lower dir first and then encode an upper dir file handle.
+ * possible when there are redirects in lower layers and non-indexed merge dirs.
+ * To mitigate those cases, we may copy up the lower dir ancestor before encoding
+ * a lower dir file handle.
+ *
+ * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
*/
-static bool ovl_should_encode_origin(struct dentry *dentry)
+static int ovl_check_encode_origin(struct dentry *dentry)
{
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ /* Upper file handle for pure upper */
if (!ovl_dentry_lower(dentry))
- return false;
+ return 0;
/*
- * Decoding a merge dir, whose origin's parent is under a redirected
- * lower dir is not always possible. As a simple aproximation, we do
- * not encode lower dir file handles when overlay has multiple lower
- * layers and origin is below the topmost lower layer.
+ * Upper file handle for non-indexed upper.
*
- * TODO: copy up only the parent that is under redirected lower.
+ * Root is never indexed, so if there's an upper layer, encode upper for
+ * root.
*/
- if (d_is_dir(dentry) && ofs->upper_mnt &&
- OVL_E(dentry)->lowerstack[0].layer->idx > 1)
- return false;
-
- /* Decoding a non-indexed upper from origin is not implemented */
if (ovl_dentry_upper(dentry) &&
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
- return false;
-
- return true;
-}
-
-static int ovl_encode_maybe_copy_up(struct dentry *dentry)
-{
- int err;
-
- if (ovl_dentry_upper(dentry))
return 0;
- err = ovl_want_write(dentry);
- if (err)
- return err;
-
- err = ovl_copy_up(dentry);
+ /*
+ * Decoding a merge dir, whose origin's ancestor is under a redirected
+ * lower dir or under a non-indexed upper is not always possible.
+ * ovl_connect_layer() will try to make origin's layer "connected" by
+ * copying up a "connectable" ancestor.
+ */
+ if (d_is_dir(dentry) && ofs->upper_mnt)
+ return ovl_connect_layer(dentry);
- ovl_drop_write(dentry);
- return err;
+ /* Lower file handle for indexed and non-upper dir/non-dir */
+ return 1;
}
static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
{
- struct dentry *origin = ovl_dentry_lower(dentry);
struct ovl_fh *fh = NULL;
- int err;
+ int err, enc_lower;
/*
- * If we should not encode a lower dir file handle, copy up and encode
- * an upper dir file handle.
+ * Check if we should encode a lower or upper file handle and maybe
+ * copy up an ancestor to make lower file handle connectable.
*/
- if (!ovl_should_encode_origin(dentry)) {
- err = ovl_encode_maybe_copy_up(dentry);
- if (err)
- goto fail;
-
- origin = NULL;
- }
+ err = enc_lower = ovl_check_encode_origin(dentry);
+ if (enc_lower < 0)
+ goto fail;
- /* Encode an upper or origin file handle */
- fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin);
+ /* Encode an upper or lower file handle */
+ fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) :
+ ovl_dentry_upper(dentry), !enc_lower);
err = PTR_ERR(fh);
if (IS_ERR(fh))
goto fail;
@@ -355,8 +477,8 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
dput(upper);
}
- if (!this)
- return NULL;
+ if (IS_ERR_OR_NULL(this))
+ return this;
if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
dput(this);
@@ -498,7 +620,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb,
if (err == -ECHILD) {
this = ovl_lookup_real_ancestor(sb, real,
layer);
- err = IS_ERR(this) ? PTR_ERR(this) : 0;
+ err = PTR_ERR_OR_ZERO(this);
}
if (!err) {
dput(connected);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index fcd97b783fa1..3b1bd469accd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
return inode;
}
+/*
+ * Does overlay inode need to be hashed by lower inode?
+ */
+static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
+ struct dentry *lower, struct dentry *index)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ /* No, if pure upper */
+ if (!lower)
+ return false;
+
+ /* Yes, if already indexed */
+ if (index)
+ return true;
+
+ /* Yes, if won't be copied up */
+ if (!ofs->upper_mnt)
+ return true;
+
+ /* No, if lower hardlink is or will be broken on copy up */
+ if ((upper || !ovl_indexdir(sb)) &&
+ !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
+ return false;
+
+ /* No, if non-indexed upper with NFS export */
+ if (sb->s_export_op && upper)
+ return false;
+
+ /* Otherwise, hash by lower inode for fsnotify */
+ return true;
+}
+
struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
struct dentry *lowerdentry, struct dentry *index,
unsigned int numlower)
{
- struct ovl_fs *ofs = sb->s_fs_info;
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
- /* Already indexed or could be indexed on copy up? */
- bool indexed = (index || (ovl_indexdir(sb) && !upperdentry));
- struct dentry *origin = indexed ? lowerdentry : NULL;
+ bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index);
bool is_dir;
- if (WARN_ON(upperdentry && indexed && !lowerdentry))
- return ERR_PTR(-EIO);
-
if (!realinode)
realinode = d_inode(lowerdentry);
/*
- * Copy up origin (lower) may exist for non-indexed non-dir upper, but
- * we must not use lower as hash key in that case.
- * Hash non-dir that is or could be indexed by origin inode.
- * Hash dir that is or could be merged by origin inode.
- * Hash pure upper and non-indexed non-dir by upper inode.
- * Hash non-indexed dir by upper inode for NFS export.
+ * Copy up origin (lower) may exist for non-indexed upper, but we must
+ * not use lower as hash key if this is a broken hardlink.
*/
is_dir = S_ISDIR(realinode->i_mode);
- if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt))
- origin = lowerdentry;
-
- if (upperdentry || origin) {
- struct inode *key = d_inode(origin ?: upperdentry);
+ if (upperdentry || bylower) {
+ struct inode *key = d_inode(bylower ? lowerdentry :
+ upperdentry);
unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
inode = iget5_locked(sb, (unsigned long) key,
@@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
set_nlink(inode, nlink);
} else {
+ /* Lower hardlink that will be broken on copy up */
inode = new_inode(sb);
if (!inode)
goto out_nomem;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index de3e6da1d5a5..70fcfcc684cc 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -913,9 +913,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
stack[ctr].layer = lower.layer;
ctr++;
- if (d.stop)
- break;
-
/*
* Following redirects can have security consequences: it's like
* a symlink into the lower layer without the permission checks.
@@ -933,6 +930,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_put;
}
+ if (d.stop)
+ break;
+
if (d.redirect && d.redirect[0] == '/' && poe != roe) {
poe = roe;
/* Find the current layer on the root dentry */
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0df25a9c94bd..225ff1171147 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -40,6 +40,7 @@ enum ovl_inode_flag {
enum ovl_entry_flag {
OVL_E_UPPER_ALIAS,
OVL_E_OPAQUE,
+ OVL_E_CONNECTED,
};
/*
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 9ee37c76091d..7c24619ae7fc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1359,6 +1359,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
/* Root is always merge -> can have whiteouts */
ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
+ ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
ovl_dentry_lower(root_dentry));
diff --git a/fs/pipe.c b/fs/pipe.c
index 7b1954caf388..39d6f431da83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -841,7 +841,7 @@ int do_pipe_flags(int *fd, int flags)
* sys_pipe() is the normal C calling standard for creating
* a pipe. It's not the way Unix traditionally does this, though.
*/
-SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
+static int do_pipe2(int __user *fildes, int flags)
{
struct file *files[2];
int fd[2];
@@ -863,9 +863,14 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
return error;
}
+SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
+{
+ return do_pipe2(fildes, flags);
+}
+
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
- return sys_pipe2(fildes, 0);
+ return do_pipe2(fildes, 0);
}
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9298324325ed..d53246863cfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -94,9 +94,6 @@
#include <linux/sched/stat.h>
#include <linux/flex_array.h>
#include <linux/posix-timers.h>
-#ifdef CONFIG_HARDWALL
-#include <asm/hardwall.h>
-#endif
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -3002,9 +2999,6 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
ONE("io", S_IRUSR, proc_tgid_io_accounting),
#endif
-#ifdef CONFIG_HARDWALL
- ONE("hardwall", S_IRUGO, proc_pid_hardwall),
-#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
@@ -3393,9 +3387,6 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
ONE("io", S_IRUSR, proc_tid_io_accounting),
#endif
-#ifdef CONFIG_HARDWALL
- ONE("hardwall", S_IRUGO, proc_pid_hardwall),
-#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
index 779caed4f078..c30572857619 100644
--- a/fs/quota/compat.c
+++ b/fs/quota/compat.c
@@ -41,8 +41,9 @@ struct compat_fs_quota_stat {
__u16 qs_iwarnlimit;
};
-asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
- qid_t id, void __user *addr)
+COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd,
+ const char __user *, special, qid_t, id,
+ void __user *, addr)
{
unsigned int cmds;
struct if_dqblk __user *dqblk;
@@ -59,7 +60,7 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
case Q_GETQUOTA:
dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
compat_dqblk = addr;
- ret = sys_quotactl(cmd, special, id, dqblk);
+ ret = kernel_quotactl(cmd, special, id, dqblk);
if (ret)
break;
if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
@@ -75,12 +76,12 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
get_user(data, &compat_dqblk->dqb_valid) ||
put_user(data, &dqblk->dqb_valid))
break;
- ret = sys_quotactl(cmd, special, id, dqblk);
+ ret = kernel_quotactl(cmd, special, id, dqblk);
break;
case Q_XGETQSTAT:
fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
compat_fsqstat = addr;
- ret = sys_quotactl(cmd, special, id, fsqstat);
+ ret = kernel_quotactl(cmd, special, id, fsqstat);
if (ret)
break;
ret = -EFAULT;
@@ -113,7 +114,7 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
ret = 0;
break;
default:
- ret = sys_quotactl(cmd, special, id, addr);
+ ret = kernel_quotactl(cmd, special, id, addr);
}
return ret;
}
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 43612e2a73af..860bfbe7a07a 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -833,8 +833,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
* calls. Maybe we need to add the process quotas etc. in the future,
* but we probably should use rlimits for that.
*/
-SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
- qid_t, id, void __user *, addr)
+int kernel_quotactl(unsigned int cmd, const char __user *special,
+ qid_t id, void __user *addr)
{
uint cmds, type;
struct super_block *sb = NULL;
@@ -885,3 +885,9 @@ out:
path_put(pathp);
return ret;
}
+
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+ qid_t, id, void __user *, addr)
+{
+ return kernel_quotactl(cmd, special, id, addr);
+}
diff --git a/fs/read_write.c b/fs/read_write.c
index f8547b82dfb3..c4eabbfc90df 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -301,7 +301,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);
-SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
+off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
off_t retval;
struct fd f = fdget_pos(fd);
@@ -319,10 +319,15 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
return retval;
}
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
+{
+ return ksys_lseek(fd, offset, whence);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
- return sys_lseek(fd, offset, whence);
+ return ksys_lseek(fd, offset, whence);
}
#endif
@@ -563,7 +568,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
file->f_pos = pos;
}
-SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
+ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
@@ -578,8 +583,12 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
return ret;
}
-SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
- size_t, count)
+SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
+{
+ return ksys_read(fd, buf, count);
+}
+
+ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
@@ -595,8 +604,14 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
return ret;
}
-SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
- size_t, count, loff_t, pos)
+SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
+ size_t, count)
+{
+ return ksys_write(fd, buf, count);
+}
+
+ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
+ loff_t pos)
{
struct fd f;
ssize_t ret = -EBADF;
@@ -615,8 +630,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
return ret;
}
-SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
- size_t, count, loff_t, pos)
+SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
+ size_t, count, loff_t, pos)
+{
+ return ksys_pread64(fd, buf, count, pos);
+}
+
+ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
+ size_t count, loff_t pos)
{
struct fd f;
ssize_t ret = -EBADF;
@@ -635,6 +656,12 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
return ret;
}
+SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
+ size_t, count, loff_t, pos)
+{
+ return ksys_pwrite64(fd, buf, count, pos);
+}
+
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
loff_t *ppos, int type, rwf_t flags)
{
diff --git a/fs/readdir.c b/fs/readdir.c
index 1b83b0ad183b..d97f548e6323 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -292,8 +292,8 @@ efault:
return -EFAULT;
}
-SYSCALL_DEFINE3(getdents64, unsigned int, fd,
- struct linux_dirent64 __user *, dirent, unsigned int, count)
+int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
+ unsigned int count)
{
struct fd f;
struct linux_dirent64 __user * lastdirent;
@@ -326,6 +326,13 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
return error;
}
+
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+ struct linux_dirent64 __user *, dirent, unsigned int, count)
+{
+ return ksys_getdents64(fd, dirent, count);
+}
+
#ifdef CONFIG_COMPAT
struct compat_old_linux_dirent {
compat_ulong_t d_ino;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 48835a659948..ae4811fecc1f 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1916,7 +1916,7 @@ struct reiserfs_de_head {
/* empty directory contains two entries "." and ".." and their headers */
#define EMPTY_DIR_SIZE \
-(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen ("..")))
+(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1))
/* old format directories have this size when empty */
#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
diff --git a/fs/select.c b/fs/select.c
index b6c36254028a..ba879c51288f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -675,8 +675,8 @@ out_nofds:
return ret;
}
-SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
- fd_set __user *, exp, struct timeval __user *, tvp)
+static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
+ fd_set __user *exp, struct timeval __user *tvp)
{
struct timespec64 end_time, *to = NULL;
struct timeval tv;
@@ -699,6 +699,12 @@ SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
return ret;
}
+SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
+ fd_set __user *, exp, struct timeval __user *, tvp)
+{
+ return kern_select(n, inp, outp, exp, tvp);
+}
+
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
fd_set __user *exp, struct timespec __user *tsp,
const sigset_t __user *sigmask, size_t sigsetsize)
@@ -784,7 +790,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+ return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif
@@ -1259,9 +1265,9 @@ out_nofds:
return ret;
}
-COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
- compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
- struct compat_timeval __user *, tvp)
+static int do_compat_select(int n, compat_ulong_t __user *inp,
+ compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ struct compat_timeval __user *tvp)
{
struct timespec64 end_time, *to = NULL;
struct compat_timeval tv;
@@ -1284,6 +1290,13 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
return ret;
}
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timeval __user *, tvp)
+{
+ return do_compat_select(n, inp, outp, exp, tvp);
+}
+
struct compat_sel_arg_struct {
compat_ulong_t n;
compat_uptr_t inp;
@@ -1298,8 +1311,8 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
- compat_ptr(a.exp), compat_ptr(a.tvp));
+ return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+ compat_ptr(a.exp), compat_ptr(a.tvp));
}
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9990957264e3..d2187a813376 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -118,13 +118,22 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
#endif
#ifdef BUS_MCEERR_AO
- /*
+ /*
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
if (kinfo->si_signo == SIGBUS &&
- (kinfo->si_code == BUS_MCEERR_AR ||
- kinfo->si_code == BUS_MCEERR_AO))
+ kinfo->si_code == BUS_MCEERR_AO)
+ err |= __put_user((short) kinfo->si_addr_lsb,
+ &uinfo->ssi_addr_lsb);
+#endif
+#ifdef BUS_MCEERR_AR
+ /*
+ * Other callers might not initialize the si_lsb field,
+ * so check explicitly for the right codes here.
+ */
+ if (kinfo->si_signo == SIGBUS &&
+ kinfo->si_code == BUS_MCEERR_AR)
err |= __put_user((short) kinfo->si_addr_lsb,
&uinfo->ssi_addr_lsb);
#endif
@@ -247,8 +256,8 @@ static const struct file_operations signalfd_fops = {
.llseek = noop_llseek,
};
-SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
- size_t, sizemask, int, flags)
+static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask,
+ int flags)
{
sigset_t sigmask;
struct signalfd_ctx *ctx;
@@ -301,17 +310,22 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
return ufd;
}
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+ size_t, sizemask, int, flags)
+{
+ return do_signalfd4(ufd, user_mask, sizemask, flags);
+}
+
SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
size_t, sizemask)
{
- return sys_signalfd4(ufd, user_mask, sizemask, 0);
+ return do_signalfd4(ufd, user_mask, sizemask, 0);
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
- const compat_sigset_t __user *,sigmask,
- compat_size_t, sigsetsize,
- int, flags)
+static long do_compat_signalfd4(int ufd,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize, int flags)
{
sigset_t tmp;
sigset_t __user *ksigmask;
@@ -324,13 +338,21 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
return -EFAULT;
- return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
+ return do_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
+}
+
+COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize,
+ int, flags)
+{
+ return do_compat_signalfd4(ufd, sigmask, sigsetsize, flags);
}
COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd,
const compat_sigset_t __user *,sigmask,
compat_size_t, sigsetsize)
{
- return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
+ return do_compat_signalfd4(ufd, sigmask, sigsetsize, 0);
}
#endif
diff --git a/fs/splice.c b/fs/splice.c
index 39e2dc01ac12..005d09cf3fa8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1331,8 +1331,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
* Currently we punt and implement it as a normal copy, see pipe_to_user().
*
*/
-SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
- unsigned long, nr_segs, unsigned int, flags)
+static long do_vmsplice(int fd, const struct iovec __user *iov,
+ unsigned long nr_segs, unsigned int flags)
{
struct fd f;
long error;
@@ -1358,6 +1358,12 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
return error;
}
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
+ unsigned long, nr_segs, unsigned int, flags)
+{
+ return do_vmsplice(fd, iov, nr_segs, flags);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
unsigned int, nr_segs, unsigned int, flags)
@@ -1375,7 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
put_user(v.iov_len, &iov[i].iov_len))
return -EFAULT;
}
- return sys_vmsplice(fd, iov, nr_segs, flags);
+ return do_vmsplice(fd, iov, nr_segs, flags);
}
#endif
diff --git a/fs/stat.c b/fs/stat.c
index 873785dae022..f8e6fb2c3657 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -379,8 +379,8 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
return error;
}
-SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
- char __user *, buf, int, bufsiz)
+static int do_readlinkat(int dfd, const char __user *pathname,
+ char __user *buf, int bufsiz)
{
struct path path;
int error;
@@ -415,10 +415,16 @@ retry:
return error;
}
+SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
+ char __user *, buf, int, bufsiz)
+{
+ return do_readlinkat(dfd, pathname, buf, bufsiz);
+}
+
SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
int, bufsiz)
{
- return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
+ return do_readlinkat(AT_FDCWD, path, buf, bufsiz);
}
diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..b54e0541ad89 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -105,7 +105,7 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
* just write metadata (such as inodes or bitmaps) to block device page cache
* and do not sync it on their own in ->sync_fs().
*/
-SYSCALL_DEFINE0(sync)
+void ksys_sync(void)
{
int nowait = 0, wait = 1;
@@ -117,6 +117,11 @@ SYSCALL_DEFINE0(sync)
iterate_bdevs(fdatawait_one_bdev, NULL);
if (unlikely(laptop_mode))
laptop_sync_completion();
+}
+
+SYSCALL_DEFINE0(sync)
+{
+ ksys_sync();
return 0;
}
@@ -187,12 +192,8 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
if (!file->f_op->fsync)
return -EINVAL;
- if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
- spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
- spin_unlock(&inode->i_lock);
+ if (!datasync && (inode->i_state & I_DIRTY_TIME))
mark_inode_dirty_sync(inode);
- }
return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
@@ -280,8 +281,8 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
* already-instantiated disk blocks, there are no guarantees here that the data
* will be available after a crash.
*/
-SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
- unsigned int, flags)
+int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
+ unsigned int flags)
{
int ret;
struct fd f;
@@ -359,10 +360,16 @@ out:
return ret;
}
+SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
+ unsigned int, flags)
+{
+ return ksys_sync_file_range(fd, offset, nbytes, flags);
+}
+
/* It would be nice if people remember that not all the world's an i386
when they introduce new system calls */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
loff_t, offset, loff_t, nbytes)
{
- return sys_sync_file_range(fd, offset, nbytes, flags);
+ return ksys_sync_file_range(fd, offset, nbytes, flags);
}
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8664db25a9a6..215c225b2ca1 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
{
return sysfs_do_create_link(kobj, target, name, 0);
}
+EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn);
/**
* sysfs_delete_link - remove symlink in object's directory.
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index cf348ba99238..1acb2ff505e6 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1256,7 +1256,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
* Inode length changed, so we have to make sure
* @I_DIRTY_DATASYNC is set.
*/
- __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
else
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 356c2bf148a5..cd31e4f6d6da 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -257,12 +257,22 @@ const struct file_operations udf_file_operations = {
static int udf_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
+ struct super_block *sb = inode->i_sb;
int error;
error = setattr_prepare(dentry, attr);
if (error)
return error;
+ if ((attr->ia_valid & ATTR_UID) &&
+ UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET) &&
+ !uid_eq(attr->ia_uid, UDF_SB(sb)->s_uid))
+ return -EPERM;
+ if ((attr->ia_valid & ATTR_GID) &&
+ UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET) &&
+ !gid_eq(attr->ia_gid, UDF_SB(sb)->s_gid))
+ return -EPERM;
+
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
error = udf_setsize(inode, attr->ia_size);
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index b6e420c1bfeb..b7a0d4b4bda1 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -104,6 +104,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
}
inode_init_owner(inode, dir, mode);
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
+ inode->i_uid = sbi->s_uid;
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
+ inode->i_gid = sbi->s_gid;
iinfo->i_location.logicalBlockNum = block;
iinfo->i_location.partitionReferenceNum =
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c23744d5ae5c..c80765d62f7e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1275,6 +1275,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
unsigned int indirections = 0;
int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
+ uint32_t uid, gid;
reread:
if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1400,17 +1401,19 @@ reread:
ret = -EIO;
read_lock(&sbi->s_cred_lock);
- i_uid_write(inode, le32_to_cpu(fe->uid));
- if (!uid_valid(inode->i_uid) ||
- UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
+ uid = le32_to_cpu(fe->uid);
+ if (uid == UDF_INVALID_ID ||
UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET))
- inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
+ inode->i_uid = sbi->s_uid;
+ else
+ i_uid_write(inode, uid);
- i_gid_write(inode, le32_to_cpu(fe->gid));
- if (!gid_valid(inode->i_gid) ||
- UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) ||
+ gid = le32_to_cpu(fe->gid);
+ if (gid == UDF_INVALID_ID ||
UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
- inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
+ inode->i_gid = sbi->s_gid;
+ else
+ i_gid_write(inode, gid);
if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
sbi->s_fmode != UDF_INVALID_MODE)
@@ -1655,12 +1658,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
}
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
- fe->uid = cpu_to_le32(-1);
+ fe->uid = cpu_to_le32(UDF_INVALID_ID);
else
fe->uid = cpu_to_le32(i_uid_read(inode));
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
- fe->gid = cpu_to_le32(-1);
+ fe->gid = cpu_to_le32(UDF_INVALID_ID);
else
fe->gid = cpu_to_le32(i_gid_read(inode));
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f73239a9a97d..7949c338efa5 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -64,14 +64,13 @@
#include <linux/init.h>
#include <linux/uaccess.h>
-#define VDS_POS_PRIMARY_VOL_DESC 0
-#define VDS_POS_UNALLOC_SPACE_DESC 1
-#define VDS_POS_LOGICAL_VOL_DESC 2
-#define VDS_POS_PARTITION_DESC 3
-#define VDS_POS_IMP_USE_VOL_DESC 4
-#define VDS_POS_VOL_DESC_PTR 5
-#define VDS_POS_TERMINATING_DESC 6
-#define VDS_POS_LENGTH 7
+enum {
+ VDS_POS_PRIMARY_VOL_DESC,
+ VDS_POS_UNALLOC_SPACE_DESC,
+ VDS_POS_LOGICAL_VOL_DESC,
+ VDS_POS_IMP_USE_VOL_DESC,
+ VDS_POS_LENGTH
+};
#define VSD_FIRST_SECTOR_OFFSET 32768
#define VSD_MAX_SECTOR_OFFSET 0x800000
@@ -223,10 +222,6 @@ struct udf_options {
unsigned int session;
unsigned int lastblock;
unsigned int anchor;
- unsigned int volume;
- unsigned short partition;
- unsigned int fileset;
- unsigned int rootdir;
unsigned int flags;
umode_t umask;
kgid_t gid;
@@ -349,12 +344,8 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",shortad");
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET))
seq_puts(seq, ",uid=forget");
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE))
- seq_puts(seq, ",uid=ignore");
if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET))
seq_puts(seq, ",gid=forget");
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE))
- seq_puts(seq, ",gid=ignore");
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid));
if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
@@ -371,10 +362,6 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
if (sbi->s_anchor != 0)
seq_printf(seq, ",anchor=%u", sbi->s_anchor);
- /*
- * volume, partition, fileset and rootdir seem to be ignored
- * currently
- */
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
seq_puts(seq, ",utf8");
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
@@ -487,14 +474,9 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
int option;
uopt->novrs = 0;
- uopt->partition = 0xFFFF;
uopt->session = 0xFFFFFFFF;
uopt->lastblock = 0;
uopt->anchor = 0;
- uopt->volume = 0xFFFFFFFF;
- uopt->rootdir = 0xFFFFFFFF;
- uopt->fileset = 0xFFFFFFFF;
- uopt->nls_map = NULL;
if (!options)
return 1;
@@ -582,42 +564,30 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
uopt->anchor = option;
break;
case Opt_volume:
- if (match_int(args, &option))
- return 0;
- uopt->volume = option;
- break;
case Opt_partition:
- if (match_int(args, &option))
- return 0;
- uopt->partition = option;
- break;
case Opt_fileset:
- if (match_int(args, &option))
- return 0;
- uopt->fileset = option;
- break;
case Opt_rootdir:
- if (match_int(args, &option))
- return 0;
- uopt->rootdir = option;
+ /* Ignored (never implemented properly) */
break;
case Opt_utf8:
uopt->flags |= (1 << UDF_FLAG_UTF8);
break;
#ifdef CONFIG_UDF_NLS
case Opt_iocharset:
- uopt->nls_map = load_nls(args[0].from);
- uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
+ if (!remount) {
+ if (uopt->nls_map)
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = load_nls(args[0].from);
+ uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
+ }
break;
#endif
- case Opt_uignore:
- uopt->flags |= (1 << UDF_FLAG_UID_IGNORE);
- break;
case Opt_uforget:
uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
break;
+ case Opt_uignore:
case Opt_gignore:
- uopt->flags |= (1 << UDF_FLAG_GID_IGNORE);
+ /* These options are superseeded by uid=<number> */
break;
case Opt_gforget:
uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
@@ -660,6 +630,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
uopt.umask = sbi->s_umask;
uopt.fmode = sbi->s_fmode;
uopt.dmode = sbi->s_dmode;
+ uopt.nls_map = NULL;
if (!udf_parse_options(options, &uopt, true))
return -EINVAL;
@@ -1592,6 +1563,60 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
sbi->s_lvid_bh = NULL;
}
+/*
+ * Step for reallocation of table of partition descriptor sequence numbers.
+ * Must be power of 2.
+ */
+#define PART_DESC_ALLOC_STEP 32
+
+struct desc_seq_scan_data {
+ struct udf_vds_record vds[VDS_POS_LENGTH];
+ unsigned int size_part_descs;
+ struct udf_vds_record *part_descs_loc;
+};
+
+static struct udf_vds_record *handle_partition_descriptor(
+ struct buffer_head *bh,
+ struct desc_seq_scan_data *data)
+{
+ struct partitionDesc *desc = (struct partitionDesc *)bh->b_data;
+ int partnum;
+
+ partnum = le16_to_cpu(desc->partitionNumber);
+ if (partnum >= data->size_part_descs) {
+ struct udf_vds_record *new_loc;
+ unsigned int new_size = ALIGN(partnum, PART_DESC_ALLOC_STEP);
+
+ new_loc = kzalloc(sizeof(*new_loc) * new_size, GFP_KERNEL);
+ if (!new_loc)
+ return ERR_PTR(-ENOMEM);
+ memcpy(new_loc, data->part_descs_loc,
+ data->size_part_descs * sizeof(*new_loc));
+ kfree(data->part_descs_loc);
+ data->part_descs_loc = new_loc;
+ data->size_part_descs = new_size;
+ }
+ return &(data->part_descs_loc[partnum]);
+}
+
+
+static struct udf_vds_record *get_volume_descriptor_record(uint16_t ident,
+ struct buffer_head *bh, struct desc_seq_scan_data *data)
+{
+ switch (ident) {
+ case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */
+ return &(data->vds[VDS_POS_PRIMARY_VOL_DESC]);
+ case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */
+ return &(data->vds[VDS_POS_IMP_USE_VOL_DESC]);
+ case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */
+ return &(data->vds[VDS_POS_LOGICAL_VOL_DESC]);
+ case TAG_IDENT_USD: /* ISO 13346 3/10.8 */
+ return &(data->vds[VDS_POS_UNALLOC_SPACE_DESC]);
+ case TAG_IDENT_PD: /* ISO 13346 3/10.5 */
+ return handle_partition_descriptor(bh, data);
+ }
+ return NULL;
+}
/*
* Process a main/reserve volume descriptor sequence.
@@ -1608,18 +1633,23 @@ static noinline int udf_process_sequence(
struct kernel_lb_addr *fileset)
{
struct buffer_head *bh = NULL;
- struct udf_vds_record vds[VDS_POS_LENGTH];
struct udf_vds_record *curr;
struct generic_desc *gd;
struct volDescPtr *vdp;
bool done = false;
uint32_t vdsn;
uint16_t ident;
- long next_s = 0, next_e = 0;
int ret;
unsigned int indirections = 0;
-
- memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
+ struct desc_seq_scan_data data;
+ unsigned int i;
+
+ memset(data.vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
+ data.size_part_descs = PART_DESC_ALLOC_STEP;
+ data.part_descs_loc = kzalloc(sizeof(*data.part_descs_loc) *
+ data.size_part_descs, GFP_KERNEL);
+ if (!data.part_descs_loc)
+ return -ENOMEM;
/*
* Read the main descriptor sequence and find which descriptors
@@ -1628,79 +1658,51 @@ static noinline int udf_process_sequence(
for (; (!done && block <= lastblock); block++) {
bh = udf_read_tagged(sb, block, block, &ident);
- if (!bh) {
- udf_err(sb,
- "Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
- (unsigned long long)block);
- return -EAGAIN;
- }
+ if (!bh)
+ break;
/* Process each descriptor (ISO 13346 3/8.3-8.4) */
gd = (struct generic_desc *)bh->b_data;
vdsn = le32_to_cpu(gd->volDescSeqNum);
switch (ident) {
- case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */
- curr = &vds[VDS_POS_PRIMARY_VOL_DESC];
- if (vdsn >= curr->volDescSeqNum) {
- curr->volDescSeqNum = vdsn;
- curr->block = block;
- }
- break;
case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */
- curr = &vds[VDS_POS_VOL_DESC_PTR];
- if (vdsn >= curr->volDescSeqNum) {
- curr->volDescSeqNum = vdsn;
- curr->block = block;
-
- vdp = (struct volDescPtr *)bh->b_data;
- next_s = le32_to_cpu(
- vdp->nextVolDescSeqExt.extLocation);
- next_e = le32_to_cpu(
- vdp->nextVolDescSeqExt.extLength);
- next_e = next_e >> sb->s_blocksize_bits;
- next_e += next_s;
+ if (++indirections > UDF_MAX_TD_NESTING) {
+ udf_err(sb, "too many Volume Descriptor "
+ "Pointers (max %u supported)\n",
+ UDF_MAX_TD_NESTING);
+ brelse(bh);
+ return -EIO;
}
+
+ vdp = (struct volDescPtr *)bh->b_data;
+ block = le32_to_cpu(vdp->nextVolDescSeqExt.extLocation);
+ lastblock = le32_to_cpu(
+ vdp->nextVolDescSeqExt.extLength) >>
+ sb->s_blocksize_bits;
+ lastblock += block - 1;
+ /* For loop is going to increment 'block' again */
+ block--;
break;
+ case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */
case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */
- curr = &vds[VDS_POS_IMP_USE_VOL_DESC];
- if (vdsn >= curr->volDescSeqNum) {
- curr->volDescSeqNum = vdsn;
- curr->block = block;
- }
- break;
- case TAG_IDENT_PD: /* ISO 13346 3/10.5 */
- curr = &vds[VDS_POS_PARTITION_DESC];
- if (!curr->block)
- curr->block = block;
- break;
case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */
- curr = &vds[VDS_POS_LOGICAL_VOL_DESC];
- if (vdsn >= curr->volDescSeqNum) {
- curr->volDescSeqNum = vdsn;
- curr->block = block;
- }
- break;
case TAG_IDENT_USD: /* ISO 13346 3/10.8 */
- curr = &vds[VDS_POS_UNALLOC_SPACE_DESC];
+ case TAG_IDENT_PD: /* ISO 13346 3/10.5 */
+ curr = get_volume_descriptor_record(ident, bh, &data);
+ if (IS_ERR(curr)) {
+ brelse(bh);
+ return PTR_ERR(curr);
+ }
+ /* Descriptor we don't care about? */
+ if (!curr)
+ break;
if (vdsn >= curr->volDescSeqNum) {
curr->volDescSeqNum = vdsn;
curr->block = block;
}
break;
case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
- if (++indirections > UDF_MAX_TD_NESTING) {
- udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING);
- brelse(bh);
- return -EIO;
- }
-
- vds[VDS_POS_TERMINATING_DESC].block = block;
- if (next_e) {
- block = next_s;
- lastblock = next_e;
- next_s = next_e = 0;
- } else
- done = true;
+ done = true;
break;
}
brelse(bh);
@@ -1709,31 +1711,27 @@ static noinline int udf_process_sequence(
* Now read interesting descriptors again and process them
* in a suitable order
*/
- if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
+ if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) {
udf_err(sb, "Primary Volume Descriptor not found!\n");
return -EAGAIN;
}
- ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block);
+ ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block);
if (ret < 0)
return ret;
- if (vds[VDS_POS_LOGICAL_VOL_DESC].block) {
+ if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) {
ret = udf_load_logicalvol(sb,
- vds[VDS_POS_LOGICAL_VOL_DESC].block,
- fileset);
+ data.vds[VDS_POS_LOGICAL_VOL_DESC].block,
+ fileset);
if (ret < 0)
return ret;
}
- if (vds[VDS_POS_PARTITION_DESC].block) {
- /*
- * We rescan the whole descriptor sequence to find
- * partition descriptor blocks and process them.
- */
- for (block = vds[VDS_POS_PARTITION_DESC].block;
- block < vds[VDS_POS_TERMINATING_DESC].block;
- block++) {
- ret = udf_load_partdesc(sb, block);
+ /* Now handle prevailing Partition Descriptors */
+ for (i = 0; i < data.size_part_descs; i++) {
+ if (data.part_descs_loc[i].block) {
+ ret = udf_load_partdesc(sb,
+ data.part_descs_loc[i].block);
if (ret < 0)
return ret;
}
@@ -1760,13 +1758,13 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
main_e = main_e >> sb->s_blocksize_bits;
- main_e += main_s;
+ main_e += main_s - 1;
/* Locate the reserve sequence */
reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
reserve_e = reserve_e >> sb->s_blocksize_bits;
- reserve_e += reserve_s;
+ reserve_e += reserve_s - 1;
/* Process the main & reserve sequences */
/* responsible for finding the PartitionDesc(s) */
@@ -1994,7 +1992,10 @@ static void udf_open_lvid(struct super_block *sb)
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
ktime_get_real_ts(&ts);
udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
- lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
+ if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
+ lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
+ else
+ UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
lvid->descTag.descCRC = cpu_to_le16(
crc_itu_t(0, (char *)lvid + sizeof(struct tag),
@@ -2034,7 +2035,8 @@ static void udf_close_lvid(struct super_block *sb)
lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
- lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
+ lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
lvid->descTag.descCRC = cpu_to_le16(
crc_itu_t(0, (char *)lvid + sizeof(struct tag),
@@ -2091,11 +2093,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
bool lvid_open = false;
uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
- uopt.uid = INVALID_UID;
- uopt.gid = INVALID_GID;
+ /* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */
+ uopt.uid = make_kuid(current_user_ns(), overflowuid);
+ uopt.gid = make_kgid(current_user_ns(), overflowgid);
uopt.umask = 0;
uopt.fmode = UDF_INVALID_MODE;
uopt.dmode = UDF_INVALID_MODE;
+ uopt.nls_map = NULL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -2276,8 +2280,8 @@ error_out:
iput(sbi->s_vat_inode);
parse_options_failure:
#ifdef CONFIG_UDF_NLS
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
- unload_nls(sbi->s_nls_map);
+ if (uopt.nls_map)
+ unload_nls(uopt.nls_map);
#endif
if (lvid_open)
udf_close_lvid(sb);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 68c9f1d618f5..9dd3e1b9619e 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -23,14 +23,13 @@
#define UDF_FLAG_NLS_MAP 9
#define UDF_FLAG_UTF8 10
#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
-#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */
-#define UDF_FLAG_GID_FORGET 13
-#define UDF_FLAG_GID_IGNORE 14
-#define UDF_FLAG_UID_SET 15
-#define UDF_FLAG_GID_SET 16
-#define UDF_FLAG_SESSION_SET 17
-#define UDF_FLAG_LASTBLOCK_SET 18
-#define UDF_FLAG_BLOCKSIZE_SET 19
+#define UDF_FLAG_GID_FORGET 12
+#define UDF_FLAG_UID_SET 13
+#define UDF_FLAG_GID_SET 14
+#define UDF_FLAG_SESSION_SET 15
+#define UDF_FLAG_LASTBLOCK_SET 16
+#define UDF_FLAG_BLOCKSIZE_SET 17
+#define UDF_FLAG_INCONSISTENT 18
#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index f5e0fe78979e..68e8a64d22e0 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -48,6 +48,8 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb,
#define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF
#define UDF_EXTENT_FLAG_MASK 0xC0000000
+#define UDF_INVALID_ID ((uint32_t)-1)
+
#define UDF_NAME_PAD 4
#define UDF_NAME_LEN 254
#define UDF_NAME_LEN_CS0 255
diff --git a/fs/utimes.c b/fs/utimes.c
index e4b3d7c2c9f5..69d4b6ba1bfb 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -184,8 +184,8 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
}
-SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
- struct timeval __user *, utimes)
+static long do_futimesat(int dfd, const char __user *filename,
+ struct timeval __user *utimes)
{
struct timeval times[2];
struct timespec64 tstimes[2];
@@ -212,10 +212,17 @@ SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
}
+
+SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
+ struct timeval __user *, utimes)
+{
+ return do_futimesat(dfd, filename, utimes);
+}
+
SYSCALL_DEFINE2(utimes, char __user *, filename,
struct timeval __user *, utimes)
{
- return sys_futimesat(AT_FDCWD, filename, utimes);
+ return do_futimesat(AT_FDCWD, filename, utimes);
}
#ifdef CONFIG_COMPAT
@@ -253,7 +260,8 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena
return do_utimes(dfd, filename, t ? tv : NULL, flags);
}
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
+static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
+ struct compat_timeval __user *t)
{
struct timespec64 tv[2];
@@ -272,8 +280,15 @@ COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filena
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+ const char __user *, filename,
+ struct compat_timeval __user *, t)
+{
+ return do_compat_futimesat(dfd, filename, t);
+}
+
COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
{
- return compat_sys_futimesat(AT_FDCWD, filename, t);
+ return do_compat_futimesat(AT_FDCWD, filename, t);
}
#endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 393b6849aeb3..7bace03dc9dc 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -46,13 +46,13 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
}
void *
-kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
{
unsigned nofs_flag = 0;
void *ptr;
gfp_t lflags;
- ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+ ptr = kmem_alloc(size, flags | KM_MAYFAIL);
if (ptr)
return ptr;
@@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
- ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags, PAGE_KERNEL);
if (flags & KM_NOFS)
memalloc_nofs_restore(nofs_flag);
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 4b87472f35bc..6023b594ead7 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -71,7 +71,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
-extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
+extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
@@ -85,6 +85,12 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
return kmem_alloc(size, flags | KM_ZERO);
}
+static inline void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+ return kmem_alloc_large(size, flags | KM_ZERO);
+}
+
/*
* Zone interfaces
*/
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 2291f4224e24..03885a968de8 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -95,13 +95,13 @@ xfs_ag_resv_critical(
switch (type) {
case XFS_AG_RESV_METADATA:
- avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+ avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
orig = pag->pag_meta_resv.ar_asked;
break;
- case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_RMAPBT:
avail = pag->pagf_freeblks + pag->pagf_flcount -
pag->pag_meta_resv.ar_reserved;
- orig = pag->pag_agfl_resv.ar_asked;
+ orig = pag->pag_rmapbt_resv.ar_asked;
break;
default:
ASSERT(0);
@@ -126,10 +126,10 @@ xfs_ag_resv_needed(
{
xfs_extlen_t len;
- len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+ len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
switch (type) {
case XFS_AG_RESV_METADATA:
- case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_RMAPBT:
len -= xfs_perag_resv(pag, type)->ar_reserved;
break;
case XFS_AG_RESV_NONE:
@@ -160,10 +160,11 @@ __xfs_ag_resv_free(
if (pag->pag_agno == 0)
pag->pag_mount->m_ag_max_usable += resv->ar_asked;
/*
- * AGFL blocks are always considered "free", so whatever
- * was reserved at mount time must be given back at umount.
+ * RMAPBT blocks come from the AGFL and AGFL blocks are always
+ * considered "free", so whatever was reserved at mount time must be
+ * given back at umount.
*/
- if (type == XFS_AG_RESV_AGFL)
+ if (type == XFS_AG_RESV_RMAPBT)
oldresv = resv->ar_orig_reserved;
else
oldresv = resv->ar_reserved;
@@ -185,7 +186,7 @@ xfs_ag_resv_free(
int error;
int err2;
- error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+ error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
if (err2 && !error)
error = err2;
@@ -284,15 +285,15 @@ xfs_ag_resv_init(
}
}
- /* Create the AGFL metadata reservation */
- if (pag->pag_agfl_resv.ar_asked == 0) {
+ /* Create the RMAPBT metadata reservation */
+ if (pag->pag_rmapbt_resv.ar_asked == 0) {
ask = used = 0;
error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
- error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
if (error)
goto out;
}
@@ -304,7 +305,7 @@ xfs_ag_resv_init(
return error;
ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
- xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
+ xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
pag->pagf_freeblks + pag->pagf_flcount);
#endif
out:
@@ -325,8 +326,10 @@ xfs_ag_resv_alloc_extent(
trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
switch (type) {
- case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_AGFL:
+ return;
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
resv = xfs_perag_resv(pag, type);
break;
default:
@@ -341,7 +344,7 @@ xfs_ag_resv_alloc_extent(
len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
resv->ar_reserved -= len;
- if (type == XFS_AG_RESV_AGFL)
+ if (type == XFS_AG_RESV_RMAPBT)
return;
/* Allocations of reserved blocks only need on-disk sb updates... */
xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
@@ -365,8 +368,10 @@ xfs_ag_resv_free_extent(
trace_xfs_ag_resv_free_extent(pag, type, len);
switch (type) {
- case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_AGFL:
+ return;
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
resv = xfs_perag_resv(pag, type);
break;
default:
@@ -379,7 +384,7 @@ xfs_ag_resv_free_extent(
leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
resv->ar_reserved += leftover;
- if (type == XFS_AG_RESV_AGFL)
+ if (type == XFS_AG_RESV_RMAPBT)
return;
/* Freeing into the reserved pool only requires on-disk update... */
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index 8d6c687deef3..938f2f96c5e8 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -32,4 +32,35 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
struct xfs_trans *tp, xfs_extlen_t len);
+/*
+ * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
+ * the AGFL, they are allocated one at a time and the reservation updates don't
+ * require a transaction.
+ */
+static inline void
+xfs_ag_resv_rmapbt_alloc(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_alloc_arg args = {0};
+ struct xfs_perag *pag;
+
+ args.len = 1;
+ pag = xfs_perag_get(mp, agno);
+ xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
+ xfs_perag_put(pag);
+}
+
+static inline void
+xfs_ag_resv_rmapbt_free(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
+ xfs_perag_put(pag);
+}
+
#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index c02781a4c091..39387bdd225d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -53,6 +53,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+/*
+ * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
+ * the beginning of the block for a proper header with the location information
+ * and CRC.
+ */
+unsigned int
+xfs_agfl_size(
+ struct xfs_mount *mp)
+{
+ unsigned int size = mp->m_sb.sb_sectsize;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ size -= sizeof(struct xfs_agfl);
+
+ return size / sizeof(xfs_agblock_t);
+}
+
unsigned int
xfs_refc_block(
struct xfs_mount *mp)
@@ -550,7 +567,7 @@ xfs_agfl_verify(
if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
return __this_address;
- for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ for (i = 0; i < xfs_agfl_size(mp); i++) {
if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return __this_address;
@@ -1564,7 +1581,6 @@ xfs_alloc_ag_vextent_small(
int *stat) /* status: 0-freelist, 1-normal/none */
{
struct xfs_owner_info oinfo;
- struct xfs_perag *pag;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1616,18 +1632,13 @@ xfs_alloc_ag_vextent_small(
/*
* If we're feeding an AGFL block to something that
* doesn't live in the free space, we need to clear
- * out the OWN_AG rmap and add the block back to
- * the AGFL per-AG reservation.
+ * out the OWN_AG rmap.
*/
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_free(args->tp, args->agbp, args->agno,
fbno, 1, &oinfo);
if (error)
goto error0;
- pag = xfs_perag_get(args->mp, args->agno);
- xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
- args->tp, 1);
- xfs_perag_put(pag);
*stat = 0;
return 0;
@@ -1911,14 +1922,12 @@ xfs_free_ag_extent(
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
- haveleft, haveright);
+ trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
- -1, -1);
+ trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -2054,6 +2063,93 @@ xfs_alloc_space_available(
}
/*
+ * Check the agfl fields of the agf for inconsistency or corruption. The purpose
+ * is to detect an agfl header padding mismatch between current and early v5
+ * kernels. This problem manifests as a 1-slot size difference between the
+ * on-disk flcount and the active [first, last] range of a wrapped agfl. This
+ * may also catch variants of agfl count corruption unrelated to padding. Either
+ * way, we'll reset the agfl and warn the user.
+ *
+ * Return true if a reset is required before the agfl can be used, false
+ * otherwise.
+ */
+static bool
+xfs_agfl_needs_reset(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf)
+{
+ uint32_t f = be32_to_cpu(agf->agf_flfirst);
+ uint32_t l = be32_to_cpu(agf->agf_fllast);
+ uint32_t c = be32_to_cpu(agf->agf_flcount);
+ int agfl_size = xfs_agfl_size(mp);
+ int active;
+
+ /* no agfl header on v4 supers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+
+ /*
+ * The agf read verifier catches severe corruption of these fields.
+ * Repeat some sanity checks to cover a packed -> unpacked mismatch if
+ * the verifier allows it.
+ */
+ if (f >= agfl_size || l >= agfl_size)
+ return true;
+ if (c > agfl_size)
+ return true;
+
+ /*
+ * Check consistency between the on-disk count and the active range. An
+ * agfl padding mismatch manifests as an inconsistent flcount.
+ */
+ if (c && l >= f)
+ active = l - f + 1;
+ else if (c)
+ active = agfl_size - f + l + 1;
+ else
+ active = 0;
+
+ return active != c;
+}
+
+/*
+ * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
+ * agfl content cannot be trusted. Warn the user that a repair is required to
+ * recover leaked blocks.
+ *
+ * The purpose of this mechanism is to handle filesystems affected by the agfl
+ * header padding mismatch problem. A reset keeps the filesystem online with a
+ * relatively minor free space accounting inconsistency rather than suffer the
+ * inevitable crash from use of an invalid agfl block.
+ */
+static void
+xfs_agfl_reset(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+
+ ASSERT(pag->pagf_agflreset);
+ trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
+
+ xfs_warn(mp,
+ "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
+ "Please unmount and run xfs_repair.",
+ pag->pag_agno, pag->pagf_flcount);
+
+ agf->agf_flfirst = 0;
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
+ agf->agf_flcount = 0;
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
+ XFS_AGF_FLCOUNT);
+
+ pag->pagf_flcount = 0;
+ pag->pagf_agflreset = false;
+}
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -2114,6 +2210,10 @@ xfs_alloc_fix_freelist(
}
}
+ /* reset a padding mismatched agfl before final free space check */
+ if (pag->pagf_agflreset)
+ xfs_agfl_reset(tp, agbp, pag);
+
/* If there isn't enough total space or single-extent, reject it. */
need = xfs_alloc_min_freelist(mp, pag);
if (!xfs_alloc_space_available(args, need, flags))
@@ -2266,10 +2366,11 @@ xfs_alloc_get_freelist(
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
- if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+ if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
@@ -2377,10 +2478,11 @@ xfs_alloc_put_freelist(
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
be32_add_cpu(&agf->agf_fllast, 1);
- if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+ if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
xfs_trans_agflist_delta(tp, 1);
pag->pagf_flcount++;
@@ -2395,7 +2497,7 @@ xfs_alloc_put_freelist(
xfs_alloc_log_agf(tp, agbp, logflags);
- ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+ ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
@@ -2428,9 +2530,9 @@ xfs_agf_verify(
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+ be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
return __this_address;
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
@@ -2588,6 +2690,7 @@ xfs_alloc_read_agf(
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
+ pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
}
#ifdef DEBUG
else if (!XFS_FORCED_SHUTDOWN(mp)) {
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 65a0cafe06e4..a311a2414a6b 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -26,6 +26,8 @@ struct xfs_trans;
extern struct workqueue_struct *xfs_alloc_wq;
+unsigned int xfs_agfl_size(struct xfs_mount *mp);
+
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
*/
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 6840b588187e..b451649ba176 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -74,18 +74,13 @@ xfs_allocbt_alloc_block(
int error;
xfs_agblock_t bno;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
/* Allocate the new block from the freelist. If we can't, give up. */
error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
&bno, 1);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
if (bno == NULLAGBLOCK) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -95,7 +90,6 @@ xfs_allocbt_alloc_block(
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index daae00ed30c5..3b03d886df66 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1244,8 +1244,9 @@ xfs_iread_extents(
xfs_warn(ip->i_mount,
"corrupt dinode %Lu, (btree extents).",
(unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, ip->i_mount, block);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ __func__, block, sizeof(*block),
+ __this_address);
error = -EFSCORRUPTED;
goto out_brelse;
}
@@ -1261,11 +1262,15 @@ xfs_iread_extents(
*/
frp = XFS_BMBT_REC_ADDR(mp, block, 1);
for (j = 0; j < num_recs; j++, frp++, i++) {
+ xfs_failaddr_t fa;
+
xfs_bmbt_disk_get_all(frp, &new);
- if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) {
- XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW, mp);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
error = -EFSCORRUPTED;
+ xfs_inode_verifier_error(ip, error,
+ "xfs_iread_extents(2)",
+ frp, sizeof(*frp), fa);
goto out_brelse;
}
xfs_iext_insert(ip, &icur, &new, state);
@@ -6154,3 +6159,39 @@ xfs_bmap_finish_one(
return error;
}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsblock_t endfsb;
+ bool isrt;
+
+ isrt = XFS_IS_REALTIME_INODE(ip);
+ endfsb = irec->br_startblock + irec->br_blockcount - 1;
+ if (isrt) {
+ if (!xfs_verify_rtbno(mp, irec->br_startblock))
+ return __this_address;
+ if (!xfs_verify_rtbno(mp, endfsb))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, irec->br_startblock))
+ return __this_address;
+ if (!xfs_verify_fsbno(mp, endfsb))
+ return __this_address;
+ if (XFS_FSB_TO_AGNO(mp, irec->br_startblock) !=
+ XFS_FSB_TO_AGNO(mp, endfsb))
+ return __this_address;
+ }
+ if (irec->br_state != XFS_EXT_NORM) {
+ if (whichfork != XFS_DATA_FORK)
+ return __this_address;
+ if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
+ return __this_address;
+ }
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e36d75799cd5..f3be6416260b 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -274,4 +274,7 @@ static inline int xfs_bmap_fork_to_state(int whichfork)
}
}
+xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *irec);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 9faf479aba49..d89d06bea6e3 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -272,10 +272,10 @@ xfs_bmbt_alloc_block(
cur->bc_private.b.dfops->dop_low = true;
}
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
+
ASSERT(args.len == 1);
cur->bc_private.b.firstblock = args.fsbno;
cur->bc_private.b.allocated++;
@@ -286,12 +286,10 @@ xfs_bmbt_alloc_block(
new->l = cpu_to_be64(args.fsbno);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 135b8c56d23e..e4505746ccaa 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -118,18 +118,4 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
-/*
- * Check that the extent does not contain an invalid unwritten extent flag.
- */
-static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
- struct xfs_bmbt_irec *irec)
-{
- if (irec->br_state == XFS_EXT_NORM)
- return true;
- if (whichfork == XFS_DATA_FORK &&
- xfs_sb_version_hasextflgbit(&mp->m_sb))
- return true;
- return false;
-}
-
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 79ee4a1951d1..edc0193358a5 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1438,8 +1438,6 @@ xfs_btree_log_keys(
int first,
int last)
{
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
if (bp) {
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
@@ -1450,8 +1448,6 @@ xfs_btree_log_keys(
xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
xfs_ilog_fbroot(cur->bc_private.b.whichfork));
}
-
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
/*
@@ -1464,15 +1460,12 @@ xfs_btree_log_recs(
int first,
int last)
{
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_rec_offset(cur, first),
xfs_btree_rec_offset(cur, last + 1) - 1);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
/*
@@ -1485,8 +1478,6 @@ xfs_btree_log_ptrs(
int first, /* index of first pointer to log */
int last) /* index of last pointer to log */
{
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
if (bp) {
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -1501,7 +1492,6 @@ xfs_btree_log_ptrs(
xfs_ilog_fbroot(cur->bc_private.b.whichfork));
}
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
/*
@@ -1543,9 +1533,6 @@ xfs_btree_log_block(
XFS_BTREE_LBLOCK_CRC_LEN
};
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
-
if (bp) {
int nbits;
@@ -1573,8 +1560,6 @@ xfs_btree_log_block(
xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
xfs_ilog_fbroot(cur->bc_private.b.whichfork));
}
-
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
/*
@@ -1593,9 +1578,6 @@ xfs_btree_increment(
int error; /* error return value */
int lev;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGI(cur, level);
-
ASSERT(level < cur->bc_nlevels);
/* Read-ahead to the right at this level. */
@@ -1671,17 +1653,14 @@ xfs_btree_increment(
cur->bc_ptrs[lev] = 1;
}
out1:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -1701,9 +1680,6 @@ xfs_btree_decrement(
int lev;
union xfs_btree_ptr ptr;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGI(cur, level);
-
ASSERT(level < cur->bc_nlevels);
/* Read-ahead to the left at this level. */
@@ -1769,17 +1745,14 @@ xfs_btree_decrement(
cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
}
out1:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -1881,9 +1854,6 @@ xfs_btree_lookup(
union xfs_btree_ptr *pp; /* ptr to btree block */
union xfs_btree_ptr ptr; /* ptr to btree block */
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGI(cur, dir);
-
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
@@ -1929,7 +1899,6 @@ xfs_btree_lookup(
ASSERT(level == 0 && cur->bc_nlevels == 1);
cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -2004,7 +1973,6 @@ xfs_btree_lookup(
if (error)
goto error0;
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
}
@@ -2019,11 +1987,9 @@ xfs_btree_lookup(
*stat = 1;
else
*stat = 0;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -2169,10 +2135,8 @@ __xfs_btree_updkeys(
trace_xfs_btree_updkeys(cur, level, bp);
#ifdef DEBUG
error = xfs_btree_check_block(cur, block, level, bp);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
#endif
ptr = cur->bc_ptrs[level];
nlkey = xfs_btree_key_addr(cur, ptr, block);
@@ -2224,9 +2188,6 @@ xfs_btree_update_keys(
if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
return __xfs_btree_updkeys(cur, level, block, bp, false);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
-
/*
* Go up the tree from this level toward the root.
* At each level, update the key value to the value input.
@@ -2241,10 +2202,8 @@ xfs_btree_update_keys(
block = xfs_btree_get_block(cur, level, &bp);
#ifdef DEBUG
error = xfs_btree_check_block(cur, block, level, bp);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
#endif
ptr = cur->bc_ptrs[level];
kp = xfs_btree_key_addr(cur, ptr, block);
@@ -2252,7 +2211,6 @@ xfs_btree_update_keys(
xfs_btree_log_keys(cur, bp, ptr, ptr);
}
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
}
@@ -2272,9 +2230,6 @@ xfs_btree_update(
int ptr;
union xfs_btree_rec *rp;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGR(cur, rec);
-
/* Pick up the current block. */
block = xfs_btree_get_block(cur, 0, &bp);
@@ -2307,11 +2262,9 @@ xfs_btree_update(
goto error0;
}
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -2339,9 +2292,6 @@ xfs_btree_lshift(
int error; /* error return value */
int i;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGI(cur, level);
-
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
level == cur->bc_nlevels - 1)
goto out0;
@@ -2500,21 +2450,17 @@ xfs_btree_lshift(
/* Slide the cursor value left one. */
cur->bc_ptrs[level]--;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
error1:
- XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
return error;
}
@@ -2541,9 +2487,6 @@ xfs_btree_rshift(
int error; /* error return value */
int i; /* loop counter */
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGI(cur, level);
-
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
(level == cur->bc_nlevels - 1))
goto out0;
@@ -2676,21 +2619,17 @@ xfs_btree_rshift(
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
error1:
- XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
return error;
}
@@ -2726,9 +2665,6 @@ __xfs_btree_split(
int i;
#endif
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
-
XFS_BTREE_STATS_INC(cur, split);
/* Set up left block (current one). */
@@ -2878,16 +2814,13 @@ __xfs_btree_split(
(*curp)->bc_ptrs[level + 1]++;
}
*ptrp = rptr;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -2994,7 +2927,6 @@ xfs_btree_new_iroot(
int i; /* loop counter */
#endif
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_STATS_INC(cur, newroot);
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
@@ -3008,10 +2940,9 @@ xfs_btree_new_iroot(
error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
- if (*stat == 0) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ if (*stat == 0)
return 0;
- }
+
XFS_BTREE_STATS_INC(cur, alloc);
/* Copy the root into a real block. */
@@ -3074,10 +3005,8 @@ xfs_btree_new_iroot(
*logflags |=
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
*stat = 1;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -3102,7 +3031,6 @@ xfs_btree_new_root(
union xfs_btree_ptr rptr;
union xfs_btree_ptr lptr;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_STATS_INC(cur, newroot);
/* initialise our start point from the cursor */
@@ -3202,14 +3130,11 @@ xfs_btree_new_root(
xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
cur->bc_ptrs[cur->bc_nlevels] = nptr;
cur->bc_nlevels++;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -3230,7 +3155,7 @@ xfs_btree_make_block_unfull(
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
level == cur->bc_nlevels - 1) {
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
@@ -3309,9 +3234,6 @@ xfs_btree_insrec(
#endif
xfs_daddr_t old_bn;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
-
ncur = NULL;
lkey = &nkey;
@@ -3324,14 +3246,12 @@ xfs_btree_insrec(
error = xfs_btree_new_root(cur, stat);
xfs_btree_set_ptr_null(cur, ptrp);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return error;
}
/* If we're off the left edge, return failure. */
ptr = cur->bc_ptrs[level];
if (ptr == 0) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -3489,12 +3409,10 @@ xfs_btree_insrec(
*curp = ncur;
}
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -3572,11 +3490,9 @@ xfs_btree_insert(
}
} while (!xfs_btree_ptr_is_null(cur, &nptr));
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = i;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
@@ -3611,8 +3527,6 @@ xfs_btree_kill_iroot(
int i;
#endif
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
ASSERT(cur->bc_nlevels > 1);
@@ -3670,19 +3584,15 @@ xfs_btree_kill_iroot(
#ifdef DEBUG
for (i = 0; i < numrecs; i++) {
error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
}
#endif
xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
error = xfs_btree_free_block(cur, cbp);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
cur->bc_bufs[level - 1] = NULL;
be16_add_cpu(&block->bb_level, -1);
@@ -3690,7 +3600,6 @@ xfs_btree_kill_iroot(
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
cur->bc_nlevels--;
out0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
}
@@ -3706,7 +3615,6 @@ xfs_btree_kill_root(
{
int error;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_STATS_INC(cur, killroot);
/*
@@ -3716,16 +3624,13 @@ xfs_btree_kill_root(
cur->bc_ops->set_root(cur, newroot, -1);
error = xfs_btree_free_block(cur, bp);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
cur->bc_bufs[level] = NULL;
cur->bc_ra[level] = 0;
cur->bc_nlevels--;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
}
@@ -3744,7 +3649,6 @@ xfs_btree_dec_cursor(
return error;
}
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
}
@@ -3780,15 +3684,11 @@ xfs_btree_delrec(
struct xfs_btree_cur *tcur; /* temporary btree cursor */
int numrecs; /* temporary numrec count */
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGI(cur, level);
-
tcur = NULL;
/* Get the index of the entry being deleted, check for nothing there. */
ptr = cur->bc_ptrs[level];
if (ptr == 0) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -3805,7 +3705,6 @@ xfs_btree_delrec(
/* Fail if we're off the end of the block. */
if (ptr > numrecs) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -4080,7 +3979,7 @@ xfs_btree_delrec(
tcur = NULL;
if (level == 0)
cur->bc_ptrs[0]++;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
*stat = 1;
return 0;
}
@@ -4250,13 +4149,11 @@ xfs_btree_delrec(
* call updkeys directly.
*/
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
/* Return value means the next level up has something to do. */
*stat = 2;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
if (tcur)
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
return error;
@@ -4277,8 +4174,6 @@ xfs_btree_delete(
int i;
bool joined = false;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
/*
* Go up the tree, starting at leaf level.
*
@@ -4314,11 +4209,9 @@ xfs_btree_delete(
}
}
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = i;
return 0;
error0:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 50440b5618e8..58e30c0975c3 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -473,25 +473,6 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
-/*
- * Trace hooks. Currently not implemented as they need to be ported
- * over to the generic tracing functionality, which is some effort.
- *
- * i,j = integer (32 bit)
- * b = btree block buffer (xfs_buf_t)
- * p = btree ptr
- * r = btree record
- * k = btree key
- */
-#define XFS_BTREE_TRACE_ARGBI(c, b, i)
-#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
-#define XFS_BTREE_TRACE_ARGI(c, i)
-#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
-#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
-#define XFS_BTREE_TRACE_ARGIK(c, i, k)
-#define XFS_BTREE_TRACE_ARGR(c, r)
-#define XFS_BTREE_TRACE_CURSOR(c, t)
-
xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 388d67c5c903..989e95a53db2 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -173,7 +173,7 @@ extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+extern int xfs_dir2_data_use_free(struct xfs_da_args *args,
struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
int *needlogp, int *needscanp);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 2da86a394bcf..875893ded514 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -451,15 +451,19 @@ xfs_dir2_block_addname(
* No stale entries, will use enddup space to hold new leaf.
*/
if (!btp->stale) {
+ xfs_dir2_data_aoff_t aoff;
+
/*
* Mark the space needed for the new leaf entry, now in use.
*/
- xfs_dir2_data_use_free(args, bp, enddup,
- (xfs_dir2_data_aoff_t)
- ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
- sizeof(*blp)),
- (xfs_dir2_data_aoff_t)sizeof(*blp),
- &needlog, &needscan);
+ aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr +
+ be16_to_cpu(enddup->length) - sizeof(*blp));
+ error = xfs_dir2_data_use_free(args, bp, enddup, aoff,
+ (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog,
+ &needscan);
+ if (error)
+ return error;
+
/*
* Update the tail (entry count).
*/
@@ -541,9 +545,11 @@ xfs_dir2_block_addname(
/*
* Mark space for the data entry used.
*/
- xfs_dir2_data_use_free(args, bp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
- (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+ error = xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+ if (error)
+ return error;
/*
* Create the new data entry.
*/
@@ -997,8 +1003,10 @@ xfs_dir2_leaf_to_block(
/*
* Use up the space at the end of the block (blp/btp).
*/
- xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
- &needlog, &needscan);
+ error = xfs_dir2_data_use_free(args, dbp, dup,
+ args->geo->blksize - size, size, &needlog, &needscan);
+ if (error)
+ return error;
/*
* Initialize the block tail.
*/
@@ -1110,18 +1118,14 @@ xfs_dir2_sf_to_block(
* Add block 0 to the inode.
*/
error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
- if (error) {
- kmem_free(sfp);
- return error;
- }
+ if (error)
+ goto out_free;
/*
* Initialize the data block, then convert it to block format.
*/
error = xfs_dir3_data_init(args, blkno, &bp);
- if (error) {
- kmem_free(sfp);
- return error;
- }
+ if (error)
+ goto out_free;
xfs_dir3_block_init(mp, tp, bp, dp);
hdr = bp->b_addr;
@@ -1136,8 +1140,10 @@ xfs_dir2_sf_to_block(
*/
dup = dp->d_ops->data_unused_p(hdr);
needlog = needscan = 0;
- xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
- i, &needlog, &needscan);
+ error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
+ if (error)
+ goto out_free;
ASSERT(needscan == 0);
/*
* Fill in the tail.
@@ -1150,9 +1156,11 @@ xfs_dir2_sf_to_block(
/*
* Remove the freespace, we'll manage it.
*/
- xfs_dir2_data_use_free(args, bp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
- be16_to_cpu(dup->length), &needlog, &needscan);
+ error = xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ be16_to_cpu(dup->length), &needlog, &needscan);
+ if (error)
+ goto out_free;
/*
* Create entry for .
*/
@@ -1256,4 +1264,7 @@ xfs_dir2_sf_to_block(
xfs_dir2_block_log_tail(tp, bp);
xfs_dir3_data_check(dp, bp);
return 0;
+out_free:
+ kmem_free(sfp);
+ return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 920279485275..cb67ec730b9b 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -932,10 +932,51 @@ xfs_dir2_data_make_free(
*needscanp = needscan;
}
+/* Check our free data for obvious signs of corruption. */
+static inline xfs_failaddr_t
+xfs_dir2_data_check_free(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len)
+{
+ if (hdr->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return __this_address;
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ return __this_address;
+ if (offset < (char *)dup - (char *)hdr)
+ return __this_address;
+ if (offset + len > (char *)dup + be16_to_cpu(dup->length) - (char *)hdr)
+ return __this_address;
+ if ((char *)dup - (char *)hdr !=
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)))
+ return __this_address;
+ return NULL;
+}
+
+/* Sanity-check a new bestfree entry. */
+static inline xfs_failaddr_t
+xfs_dir2_data_check_new_free(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *dfp,
+ struct xfs_dir2_data_unused *newdup)
+{
+ if (dfp == NULL)
+ return __this_address;
+ if (dfp->length != newdup->length)
+ return __this_address;
+ if (be16_to_cpu(dfp->offset) != (char *)newdup - (char *)hdr)
+ return __this_address;
+ return NULL;
+}
+
/*
* Take a byte range out of an existing unused space and make it un-free.
*/
-void
+int
xfs_dir2_data_use_free(
struct xfs_da_args *args,
struct xfs_buf *bp,
@@ -947,23 +988,19 @@ xfs_dir2_data_use_free(
{
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
+ struct xfs_dir2_data_free *bf;
+ xfs_failaddr_t fa;
int matchback; /* matches end of freespace */
int matchfront; /* matches start of freespace */
int needscan; /* need to regen bestfree */
- xfs_dir2_data_unused_t *newdup; /* new unused entry */
- xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
int oldlen; /* old unused entry's length */
- struct xfs_dir2_data_free *bf;
hdr = bp->b_addr;
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
- ASSERT(offset >= (char *)dup - (char *)hdr);
- ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
- ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+ fa = xfs_dir2_data_check_free(hdr, dup, offset, len);
+ if (fa)
+ goto corrupt;
/*
* Look up the entry in the bestfree table.
*/
@@ -1008,9 +1045,9 @@ xfs_dir2_data_use_free(
xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
needlogp);
- ASSERT(dfp != NULL);
- ASSERT(dfp->length == newdup->length);
- ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+ fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup);
+ if (fa)
+ goto corrupt;
/*
* If we got inserted at the last slot,
* that means we don't know if there was a better
@@ -1036,9 +1073,9 @@ xfs_dir2_data_use_free(
xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
needlogp);
- ASSERT(dfp != NULL);
- ASSERT(dfp->length == newdup->length);
- ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+ fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup);
+ if (fa)
+ goto corrupt;
/*
* If we got inserted at the last slot,
* that means we don't know if there was a better
@@ -1084,6 +1121,11 @@ xfs_dir2_data_use_free(
}
}
*needscanp = needscan;
+ return 0;
+corrupt:
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
+ hdr, __FILE__, __LINE__, fa);
+ return -EFSCORRUPTED;
}
/* Find the end of the entry data in a data/block format dir block. */
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index d7e630f41f9c..50fc9c0c5e2b 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -877,9 +877,13 @@ xfs_dir2_leaf_addname(
/*
* Mark the initial part of our freespace in use for the new entry.
*/
- xfs_dir2_data_use_free(args, dbp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
- &needlog, &needscan);
+ error = xfs_dir2_data_use_free(args, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ length, &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
/*
* Initialize our new entry (at last).
*/
@@ -1415,7 +1419,8 @@ xfs_dir2_leaf_removename(
oldbest = be16_to_cpu(bf[0].length);
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
+ if (be16_to_cpu(bestsp[db]) != oldbest)
+ return -EFSCORRUPTED;
/*
* Mark the former data entry unused.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 239d97a64296..9df096cc3c37 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -387,8 +387,9 @@ xfs_dir2_leaf_to_node(
dp->d_ops->free_hdr_from_disk(&freehdr, free);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
- ASSERT(be32_to_cpu(ltp->bestcount) <=
- (uint)dp->i_d.di_size / args->geo->blksize);
+ if (be32_to_cpu(ltp->bestcount) >
+ (uint)dp->i_d.di_size / args->geo->blksize)
+ return -EFSCORRUPTED;
/*
* Copy freespace entries from the leaf block to the new block.
@@ -1728,6 +1729,7 @@ xfs_dir2_node_addname_int(
__be16 *bests;
struct xfs_dir3_icfree_hdr freehdr;
struct xfs_dir2_data_free *bf;
+ xfs_dir2_data_aoff_t aoff;
dp = args->dp;
mp = dp->i_mount;
@@ -2022,9 +2024,13 @@ xfs_dir2_node_addname_int(
/*
* Mark the first part of the unused space, inuse for us.
*/
- xfs_dir2_data_use_free(args, dbp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
- &needlog, &needscan);
+ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+ error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
+ &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, dbp);
+ return error;
+ }
/*
* Fill in the new entry and log it.
*/
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 1acb584fc5f7..42956d8d95ed 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -803,24 +803,13 @@ typedef struct xfs_agi {
&(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
(__be32 *)(bp)->b_addr)
-/*
- * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
- * slots in the beginning of the block for a proper header with the
- * location information and CRC.
- */
-#define XFS_AGFL_SIZE(mp) \
- (((mp)->m_sb.sb_sectsize - \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
- sizeof(struct xfs_agfl) : 0)) / \
- sizeof(xfs_agblock_t))
-
typedef struct xfs_agfl {
__be32 agfl_magicnum;
__be32 agfl_seqno;
uuid_t agfl_uuid;
__be64 agfl_lsn;
__be32 agfl_crc;
- __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
+ __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */
} __attribute__((packed)) xfs_agfl_t;
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index af197a5f3a82..a2dd7f4a2719 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -93,8 +93,6 @@ __xfs_inobt_alloc_block(
int error; /* error return value */
xfs_agblock_t sbno = be32_to_cpu(start->s);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
@@ -107,17 +105,14 @@ __xfs_inobt_alloc_block(
args.resv = resv;
error = xfs_alloc_vextent(&args);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
+
if (args.fsbno == NULLFSBLOCK) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
ASSERT(args.len == 1);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 4fe17b368316..ef68b1de006a 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -93,20 +93,26 @@ xfs_inode_buf_verify(
bool readahead)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
+ xfs_agnumber_t agno;
int i;
int ni;
/*
* Validate the magic number and version of every inode in the buffer
*/
+ agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
int di_ok;
xfs_dinode_t *dip;
+ xfs_agino_t unlinked_ino;
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
+ unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
- xfs_dinode_good_version(mp, dip->di_version);
+ xfs_dinode_good_version(mp, dip->di_version) &&
+ (unlinked_ino == NULLAGINO ||
+ xfs_verify_agino(mp, agno, unlinked_ino));
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -115,16 +121,18 @@ xfs_inode_buf_verify(
return;
}
- xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
#ifdef DEBUG
xfs_alert(mp,
"bad inode magic/vsn daddr %lld #%d (magic=%x)",
(unsigned long long)bp->b_bn, i,
be16_to_cpu(dip->di_magic));
#endif
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, dip, sizeof(*dip),
+ NULL);
+ return;
}
}
- xfs_inobp_check(mp, bp);
}
@@ -564,10 +572,7 @@ xfs_iread(
/* initialise the on-disk inode core */
memset(&ip->i_d, 0, sizeof(ip->i_d));
VFS_I(ip)->i_generation = prandom_u32();
- if (xfs_sb_version_hascrc(&mp->m_sb))
- ip->i_d.di_version = 3;
- else
- ip->i_d.di_version = 2;
+ ip->i_d.di_version = 3;
return 0;
}
@@ -649,3 +654,108 @@ xfs_iread(
xfs_trans_brelse(tp, bp);
return error;
}
+
+/*
+ * Validate di_extsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_extsize().
+ * These functions must be kept in sync with each other.
+ */
+xfs_failaddr_t
+xfs_inode_validate_extsize(
+ struct xfs_mount *mp,
+ uint32_t extsize,
+ uint16_t mode,
+ uint16_t flags)
+{
+ bool rt_flag;
+ bool hint_flag;
+ bool inherit_flag;
+ uint32_t extsize_bytes;
+ uint32_t blocksize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
+ inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
+ extsize_bytes = XFS_FSB_TO_B(mp, extsize);
+
+ if (rt_flag)
+ blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
+ if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
+ return __this_address;
+
+ if (hint_flag && !S_ISREG(mode))
+ return __this_address;
+
+ if (inherit_flag && !S_ISDIR(mode))
+ return __this_address;
+
+ if ((hint_flag || inherit_flag) && extsize == 0)
+ return __this_address;
+
+ if (!(hint_flag || inherit_flag) && extsize != 0)
+ return __this_address;
+
+ if (extsize_bytes % blocksize_bytes)
+ return __this_address;
+
+ if (extsize > MAXEXTLEN)
+ return __this_address;
+
+ if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
+ * These functions must be kept in sync with each other.
+ */
+xfs_failaddr_t
+xfs_inode_validate_cowextsize(
+ struct xfs_mount *mp,
+ uint32_t cowextsize,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ bool rt_flag;
+ bool hint_flag;
+ uint32_t cowextsize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
+ cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
+
+ if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb))
+ return __this_address;
+
+ if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
+ return __this_address;
+
+ if (hint_flag && cowextsize == 0)
+ return __this_address;
+
+ if (!hint_flag && cowextsize != 0)
+ return __this_address;
+
+ if (hint_flag && rt_flag)
+ return __this_address;
+
+ if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+ return __this_address;
+
+ if (cowextsize > MAXEXTLEN)
+ return __this_address;
+
+ if (cowextsize > mp->m_sb.sb_agblocks / 2)
+ return __this_address;
+
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 8a5e1da52d74..d9a376a78ee2 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -84,5 +84,10 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
+xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
+ uint32_t extsize, uint16_t mode, uint16_t flags);
+xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
+ uint32_t cowextsize, uint16_t mode, uint16_t flags,
+ uint64_t flags2);
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 866d2861c625..701c42a28d05 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -195,8 +195,9 @@ xfs_iformat_local(
"corrupt inode %Lu (bad size %d for local fork, size = %d).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
- XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_local", dip, sizeof(*dip),
+ __this_address);
return -EFSCORRUPTED;
}
@@ -231,8 +232,9 @@ xfs_iformat_extents(
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
(unsigned long long) ip->i_ino, nex);
- XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
- mp, dip);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_extents(1)", dip, sizeof(*dip),
+ __this_address);
return -EFSCORRUPTED;
}
@@ -245,10 +247,14 @@ xfs_iformat_extents(
xfs_iext_first(ifp, &icur);
for (i = 0; i < nex; i++, dp++) {
+ xfs_failaddr_t fa;
+
xfs_bmbt_disk_get_all(dp, &new);
- if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) {
- XFS_ERROR_REPORT("xfs_iformat_extents(2)",
- XFS_ERRLEVEL_LOW, mp);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_extents(2)",
+ dp, sizeof(*dp), fa);
return -EFSCORRUPTED;
}
@@ -305,8 +311,9 @@ xfs_iformat_btree(
level == 0 || level > XFS_BTREE_MAXLEVELS) {
xfs_warn(mp, "corrupt inode %Lu (btree).",
(unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
- mp, dip);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_btree", dfp, size,
+ __this_address);
return -EFSCORRUPTED;
}
@@ -595,7 +602,7 @@ xfs_iextents_copy(
for_each_xfs_iext(ifp, &icur, &rec) {
if (isnullstartblock(rec.br_startblock))
continue;
- ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec));
+ ASSERT(xfs_bmap_validate_extent(ip, whichfork, &rec) == NULL);
xfs_bmbt_disk_set_all(dp, &rec);
trace_xfs_write_extent(ip, &icur, state, _RET_IP_);
copied += sizeof(struct xfs_bmbt_rec);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 8479769e470d..265fdcefcbae 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -79,8 +79,6 @@ xfs_refcountbt_alloc_block(
struct xfs_alloc_arg args; /* block allocation args */
int error; /* error return value */
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
@@ -98,7 +96,6 @@ xfs_refcountbt_alloc_block(
trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -109,12 +106,10 @@ xfs_refcountbt_alloc_block(
be32_add_cpu(&agf->agf_refcount_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
out_error:
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index e829c3e489ea..8b0d0de1cd11 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -104,20 +104,15 @@ xfs_rmapbt_alloc_block(
int error;
xfs_agblock_t bno;
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
/* Allocate the new block from the freelist. If we can't, give up. */
error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
&bno, 1);
- if (error) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (error)
return error;
- }
trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
bno, 1);
if (bno == NULLAGBLOCK) {
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
@@ -130,7 +125,8 @@ xfs_rmapbt_alloc_block(
be32_add_cpu(&agf->agf_rmap_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
+
*stat = 1;
return 0;
}
@@ -158,6 +154,8 @@ xfs_rmapbt_free_block(
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
+ xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
+
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a55f7a45fa78..53433cc024fd 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -731,7 +731,6 @@ xfs_sb_mount_common(
struct xfs_sb *sbp)
{
mp->m_agfrotor = mp->m_agirotor = 0;
- spin_lock_init(&mp->m_agirotor_lock);
mp->m_maxagi = mp->m_sb.sb_agcount;
mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index fd975524f460..018aabbd9394 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -80,7 +80,7 @@ xfs_scrub_walk_agfl(
}
/* first to the end */
- for (i = flfirst; i < XFS_AGFL_SIZE(mp); i++) {
+ for (i = flfirst; i < xfs_agfl_size(mp); i++) {
error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
if (error)
return error;
@@ -664,7 +664,7 @@ xfs_scrub_agf(
if (agfl_last > agfl_first)
fl_count = agfl_last - agfl_first + 1;
else
- fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1;
+ fl_count = xfs_agfl_size(mp) - agfl_first + agfl_last + 1;
if (agfl_count != 0 && fl_count != agfl_count)
xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
@@ -767,7 +767,7 @@ int
xfs_scrub_agfl(
struct xfs_scrub_context *sc)
{
- struct xfs_scrub_agfl_info sai = { 0 };
+ struct xfs_scrub_agfl_info sai;
struct xfs_agf *agf;
xfs_agnumber_t agno;
unsigned int agflcount;
@@ -791,10 +791,11 @@ xfs_scrub_agfl(
/* Allocate buffer to ensure uniqueness of AGFL entries. */
agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
agflcount = be32_to_cpu(agf->agf_flcount);
- if (agflcount > XFS_AGFL_SIZE(sc->mp)) {
+ if (agflcount > xfs_agfl_size(sc->mp)) {
xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
goto out;
}
+ memset(&sai, 0, sizeof(sai));
sai.sz_entries = agflcount;
sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
if (!sai.entries) {
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 4ed80474f545..127575f0abfb 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -98,7 +98,7 @@ xfs_scrub_xattr_listent(
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
- xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL);
+ xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index d00282130492..639d14b51e90 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -37,6 +37,7 @@
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
@@ -423,6 +424,169 @@ xfs_scrub_bmap_btree(
return error;
}
+struct xfs_scrub_bmap_check_rmap_info {
+ struct xfs_scrub_context *sc;
+ int whichfork;
+ struct xfs_iext_cursor icur;
+};
+
+/* Can we find bmaps that fit this rmap? */
+STATIC int
+xfs_scrub_bmap_check_rmap(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_check_rmap_info *sbcri = priv;
+ struct xfs_ifork *ifp;
+ struct xfs_scrub_context *sc = sbcri->sc;
+ bool have_map;
+
+ /* Is this even the right fork? */
+ if (rec->rm_owner != sc->ip->i_ino)
+ return 0;
+ if ((sbcri->whichfork == XFS_ATTR_FORK) ^
+ !!(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ return 0;
+
+ /* Now look up the bmbt record. */
+ ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork);
+ if (!ifp) {
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ goto out;
+ }
+ have_map = xfs_iext_lookup_extent(sc->ip, ifp, rec->rm_offset,
+ &sbcri->icur, &irec);
+ if (!have_map)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ /*
+ * bmap extent record lengths are constrained to 2^21 blocks in length
+ * because of space constraints in the on-disk metadata structure.
+ * However, rmap extent record lengths are constrained only by AG
+ * length, so we have to loop through the bmbt to make sure that the
+ * entire rmap is covered by bmbt records.
+ */
+ while (have_map) {
+ if (irec.br_startoff != rec->rm_offset)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
+ cur->bc_private.a.agno, rec->rm_startblock))
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ if (irec.br_blockcount > rec->rm_blockcount)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ rec->rm_startblock += irec.br_blockcount;
+ rec->rm_offset += irec.br_blockcount;
+ rec->rm_blockcount -= irec.br_blockcount;
+ if (rec->rm_blockcount == 0)
+ break;
+ have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec);
+ if (!have_map)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ }
+
+out:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+ return 0;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xfs_scrub_bmap_check_ag_rmaps(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_agnumber_t agno)
+{
+ struct xfs_scrub_bmap_check_rmap_info sbcri;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno);
+ if (!cur) {
+ error = -ENOMEM;
+ goto out_agf;
+ }
+
+ sbcri.sc = sc;
+ sbcri.whichfork = whichfork;
+ error = xfs_rmap_query_all(cur, xfs_scrub_bmap_check_rmap, &sbcri);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+out_agf:
+ xfs_trans_brelse(sc->tp, agf);
+ return error;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xfs_scrub_bmap_check_rmaps(
+ struct xfs_scrub_context *sc,
+ int whichfork)
+{
+ loff_t size;
+ xfs_agnumber_t agno;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) ||
+ whichfork == XFS_COW_FORK ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return 0;
+
+ /* Don't support realtime rmap checks yet. */
+ if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
+ return 0;
+
+ /*
+ * Only do this for complex maps that are in btree format, or for
+ * situations where we would seem to have a size but zero extents.
+ * The inode repair code can zap broken iforks, which means we have
+ * to flag this bmap as corrupt if there are rmaps that need to be
+ * reattached.
+ */
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ size = i_size_read(VFS_I(sc->ip));
+ break;
+ case XFS_ATTR_FORK:
+ size = XFS_IFORK_Q(sc->ip);
+ break;
+ default:
+ size = 0;
+ break;
+ }
+ if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0))
+ return 0;
+
+ for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) {
+ error = xfs_scrub_bmap_check_ag_rmaps(sc, whichfork, agno);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ }
+
+ return 0;
+}
+
/*
* Scrub an inode fork's block mappings.
*
@@ -457,16 +621,16 @@ xfs_scrub_bmap(
goto out;
/* No CoW forks on non-reflink inodes/filesystems. */
if (!xfs_is_reflink_inode(ip)) {
- xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
goto out;
}
break;
case XFS_ATTR_FORK:
if (!ifp)
- goto out;
+ goto out_check_rmap;
if (!xfs_sb_version_hasattr(&mp->m_sb) &&
!xfs_sb_version_hasattr2(&mp->m_sb))
- xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
break;
default:
ASSERT(whichfork == XFS_DATA_FORK);
@@ -534,6 +698,10 @@ xfs_scrub_bmap(
goto out;
}
+out_check_rmap:
+ error = xfs_scrub_bmap_check_rmaps(sc, whichfork);
+ if (!xfs_scrub_fblock_xref_process_error(sc, whichfork, 0, &error))
+ goto out;
out:
return error;
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8033ab9d8f47..8ed91d5c868d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -213,12 +213,10 @@ xfs_scrub_block_set_preen(
void
xfs_scrub_ino_set_preen(
struct xfs_scrub_context *sc,
- xfs_ino_t ino,
- struct xfs_buf *bp)
+ xfs_ino_t ino)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
- trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0,
- __return_address);
+ trace_xfs_scrub_ino_preen(sc, ino, __return_address);
}
/* Record a corrupt block. */
@@ -249,22 +247,20 @@ xfs_scrub_block_xref_set_corrupt(
void
xfs_scrub_ino_set_corrupt(
struct xfs_scrub_context *sc,
- xfs_ino_t ino,
- struct xfs_buf *bp)
+ xfs_ino_t ino)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address);
+ trace_xfs_scrub_ino_error(sc, ino, __return_address);
}
/* Record a corruption while cross-referencing with an inode. */
void
xfs_scrub_ino_xref_set_corrupt(
struct xfs_scrub_context *sc,
- xfs_ino_t ino,
- struct xfs_buf *bp)
+ xfs_ino_t ino)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
- trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address);
+ trace_xfs_scrub_ino_error(sc, ino, __return_address);
}
/* Record corruption in a block indexed by a file fork. */
@@ -296,12 +292,10 @@ xfs_scrub_fblock_xref_set_corrupt(
void
xfs_scrub_ino_set_warning(
struct xfs_scrub_context *sc,
- xfs_ino_t ino,
- struct xfs_buf *bp)
+ xfs_ino_t ino)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
- trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0,
- __return_address);
+ trace_xfs_scrub_ino_warning(sc, ino, __return_address);
}
/* Warn about a block indexed by a file fork that needs review. */
@@ -619,7 +613,7 @@ xfs_scrub_checkpoint_log(
{
int error;
- error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
if (error)
return error;
xfs_ail_push_all_sync(mp->m_ail);
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index ddb65d22c76a..deaf60400981 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -63,25 +63,22 @@ bool xfs_scrub_fblock_xref_process_error(struct xfs_scrub_context *sc,
void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc,
struct xfs_buf *bp);
-void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino,
- struct xfs_buf *bp);
+void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino);
void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc,
struct xfs_buf *bp);
-void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino,
- struct xfs_buf *bp);
+void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino);
void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork,
xfs_fileoff_t offset);
void xfs_scrub_block_xref_set_corrupt(struct xfs_scrub_context *sc,
struct xfs_buf *bp);
-void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino,
- struct xfs_buf *bp);
+void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc,
+ xfs_ino_t ino);
void xfs_scrub_fblock_xref_set_corrupt(struct xfs_scrub_context *sc,
int whichfork, xfs_fileoff_t offset);
-void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino,
- struct xfs_buf *bp);
+void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino);
void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork,
xfs_fileoff_t offset);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 50b6a26b0299..38f29806eb54 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -781,7 +781,7 @@ xfs_scrub_directory(
/* Plausible size? */
if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) {
- xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
goto out;
}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 63ab3f98430d..106ca4bd753f 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -259,7 +259,8 @@ xfs_scrub_iallocbt_check_freemask(
error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
&dip, &bp, 0, 0);
- if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error))
+ if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur, 0,
+ &error))
continue;
/* Which inodes are free? */
@@ -433,7 +434,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes(
if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != inode_blocks)
- xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0);
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
/* Scrub the inode btrees for some AG. */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 21297bef8df1..df14930e4fc5 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -89,67 +89,21 @@ out:
/* Inode core */
-/*
- * Validate di_extsize hint.
- *
- * The rules are documented at xfs_ioctl_setattr_check_extsize().
- * These functions must be kept in sync with each other.
- */
+/* Validate di_extsize hint. */
STATIC void
xfs_scrub_inode_extsize(
struct xfs_scrub_context *sc,
- struct xfs_buf *bp,
struct xfs_dinode *dip,
xfs_ino_t ino,
uint16_t mode,
uint16_t flags)
{
- struct xfs_mount *mp = sc->mp;
- bool rt_flag;
- bool hint_flag;
- bool inherit_flag;
- uint32_t extsize;
- uint32_t extsize_bytes;
- uint32_t blocksize_bytes;
-
- rt_flag = (flags & XFS_DIFLAG_REALTIME);
- hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
- inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
- extsize = be32_to_cpu(dip->di_extsize);
- extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize);
-
- if (rt_flag)
- blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
- else
- blocksize_bytes = mp->m_sb.sb_blocksize;
-
- if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
- goto bad;
-
- if (hint_flag && !S_ISREG(mode))
- goto bad;
-
- if (inherit_flag && !S_ISDIR(mode))
- goto bad;
-
- if ((hint_flag || inherit_flag) && extsize == 0)
- goto bad;
-
- if (!(hint_flag || inherit_flag) && extsize != 0)
- goto bad;
-
- if (extsize_bytes % blocksize_bytes)
- goto bad;
-
- if (extsize > MAXEXTLEN)
- goto bad;
+ xfs_failaddr_t fa;
- if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
- goto bad;
-
- return;
-bad:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa)
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
/*
@@ -161,58 +115,25 @@ bad:
STATIC void
xfs_scrub_inode_cowextsize(
struct xfs_scrub_context *sc,
- struct xfs_buf *bp,
struct xfs_dinode *dip,
xfs_ino_t ino,
uint16_t mode,
uint16_t flags,
uint64_t flags2)
{
- struct xfs_mount *mp = sc->mp;
- bool rt_flag;
- bool hint_flag;
- uint32_t extsize;
- uint32_t extsize_bytes;
-
- rt_flag = (flags & XFS_DIFLAG_REALTIME);
- hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
- extsize = be32_to_cpu(dip->di_cowextsize);
- extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize);
-
- if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb))
- goto bad;
-
- if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
- goto bad;
-
- if (hint_flag && extsize == 0)
- goto bad;
-
- if (!hint_flag && extsize != 0)
- goto bad;
-
- if (hint_flag && rt_flag)
- goto bad;
-
- if (extsize_bytes % mp->m_sb.sb_blocksize)
- goto bad;
-
- if (extsize > MAXEXTLEN)
- goto bad;
-
- if (extsize > mp->m_sb.sb_agblocks / 2)
- goto bad;
+ xfs_failaddr_t fa;
- return;
-bad:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ fa = xfs_inode_validate_cowextsize(sc->mp,
+ be32_to_cpu(dip->di_cowextsize), mode, flags,
+ flags2);
+ if (fa)
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
/* Make sure the di_flags make sense for the inode. */
STATIC void
xfs_scrub_inode_flags(
struct xfs_scrub_context *sc,
- struct xfs_buf *bp,
struct xfs_dinode *dip,
xfs_ino_t ino,
uint16_t mode,
@@ -251,14 +172,13 @@ xfs_scrub_inode_flags(
return;
bad:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
/* Make sure the di_flags2 make sense for the inode. */
STATIC void
xfs_scrub_inode_flags2(
struct xfs_scrub_context *sc,
- struct xfs_buf *bp,
struct xfs_dinode *dip,
xfs_ino_t ino,
uint16_t mode,
@@ -295,14 +215,13 @@ xfs_scrub_inode_flags2(
return;
bad:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
/* Scrub all the ondisk inode fields. */
STATIC void
xfs_scrub_dinode(
struct xfs_scrub_context *sc,
- struct xfs_buf *bp,
struct xfs_dinode *dip,
xfs_ino_t ino)
{
@@ -333,7 +252,7 @@ xfs_scrub_dinode(
/* mode is recognized */
break;
default:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
}
@@ -344,22 +263,22 @@ xfs_scrub_dinode(
* We autoconvert v1 inodes into v2 inodes on writeout,
* so just mark this inode for preening.
*/
- xfs_scrub_ino_set_preen(sc, ino, bp);
+ xfs_scrub_ino_set_preen(sc, ino);
break;
case 2:
case 3:
if (dip->di_onlink != 0)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
if (dip->di_mode == 0 && sc->ip)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
if (dip->di_projid_hi != 0 &&
!xfs_sb_version_hasprojid32bit(&mp->m_sb))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
default:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
return;
}
@@ -369,40 +288,40 @@ xfs_scrub_dinode(
*/
if (dip->di_uid == cpu_to_be32(-1U) ||
dip->di_gid == cpu_to_be32(-1U))
- xfs_scrub_ino_set_warning(sc, ino, bp);
+ xfs_scrub_ino_set_warning(sc, ino);
/* di_format */
switch (dip->di_format) {
case XFS_DINODE_FMT_DEV:
if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
!S_ISFIFO(mode) && !S_ISSOCK(mode))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_LOCAL:
if (!S_ISDIR(mode) && !S_ISLNK(mode))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_EXTENTS:
if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
if (!S_ISREG(mode) && !S_ISDIR(mode))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_UUID:
default:
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
}
/* di_[amc]time.nsec */
if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/*
* di_size. xfs_dinode_verify checks for things that screw up
@@ -411,19 +330,19 @@ xfs_scrub_dinode(
*/
isize = be64_to_cpu(dip->di_size);
if (isize & (1ULL << 63))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/* Devices, fifos, and sockets must have zero size */
if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/* Directories can't be larger than the data section size (32G) */
if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/* Symlinks can't be larger than SYMLINK_MAXLEN */
if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN))
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/*
* Warn if the running kernel can't handle the kinds of offsets
@@ -432,7 +351,7 @@ xfs_scrub_dinode(
* overly large offsets, flag the inode for admin review.
*/
if (isize >= mp->m_super->s_maxbytes)
- xfs_scrub_ino_set_warning(sc, ino, bp);
+ xfs_scrub_ino_set_warning(sc, ino);
/* di_nblocks */
if (flags2 & XFS_DIFLAG2_REFLINK) {
@@ -447,15 +366,15 @@ xfs_scrub_dinode(
*/
if (be64_to_cpu(dip->di_nblocks) >=
mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
} else {
if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
- xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags);
+ xfs_scrub_inode_flags(sc, dip, ino, mode, flags);
- xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags);
+ xfs_scrub_inode_extsize(sc, dip, ino, mode, flags);
/* di_nextents */
nextents = be32_to_cpu(dip->di_nextents);
@@ -463,31 +382,31 @@ xfs_scrub_dinode(
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
if (nextents > fork_recs)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
if (nextents <= fork_recs)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
default:
if (nextents != 0)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
}
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
if (dip->di_anextents != 0 && dip->di_forkoff == 0)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/* di_aformat */
if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
dip->di_aformat != XFS_DINODE_FMT_BTREE)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
/* di_anextents */
nextents = be16_to_cpu(dip->di_anextents);
@@ -495,92 +414,26 @@ xfs_scrub_dinode(
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
if (nextents > fork_recs)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
if (nextents <= fork_recs)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
break;
default:
if (nextents != 0)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
if (dip->di_version >= 3) {
if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
- xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2);
- xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags,
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ xfs_scrub_inode_flags2(sc, dip, ino, mode, flags, flags2);
+ xfs_scrub_inode_cowextsize(sc, dip, ino, mode, flags,
flags2);
}
}
-/* Map and read a raw inode. */
-STATIC int
-xfs_scrub_inode_map_raw(
- struct xfs_scrub_context *sc,
- xfs_ino_t ino,
- struct xfs_buf **bpp,
- struct xfs_dinode **dipp)
-{
- struct xfs_imap imap;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp = NULL;
- struct xfs_dinode *dip;
- int error;
-
- error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
- if (error == -EINVAL) {
- /*
- * Inode could have gotten deleted out from under us;
- * just forget about it.
- */
- error = -ENOENT;
- goto out;
- }
- if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
- XFS_INO_TO_AGBNO(mp, ino), &error))
- goto out;
-
- error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp,
- NULL);
- if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
- XFS_INO_TO_AGBNO(mp, ino), &error))
- goto out;
-
- /*
- * Is this really an inode? We disabled verifiers in the above
- * xfs_trans_read_buf call because the inode buffer verifier
- * fails on /any/ inode record in the inode cluster with a bad
- * magic or version number, not just the one that we're
- * checking. Therefore, grab the buffer unconditionally, attach
- * the inode verifiers by hand, and run the inode verifier only
- * on the one inode we want.
- */
- bp->b_ops = &xfs_inode_buf_ops;
- dip = xfs_buf_offset(bp, imap.im_boffset);
- if (xfs_dinode_verify(mp, ino, dip) != NULL ||
- !xfs_dinode_good_version(mp, dip->di_version)) {
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
- goto out_buf;
- }
-
- /* ...and is it the one we asked for? */
- if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) {
- error = -ENOENT;
- goto out_buf;
- }
-
- *dipp = dip;
- *bpp = bp;
-out:
- return error;
-out_buf:
- xfs_trans_brelse(sc->tp, bp);
- return error;
-}
-
/*
* Make sure the finobt doesn't think this inode is free.
* We don't have to check the inobt ourselves because we got the inode via
@@ -645,18 +498,18 @@ xfs_scrub_inode_xref_bmap(
if (!xfs_scrub_should_check_xref(sc, &error, NULL))
return;
if (nextents < be32_to_cpu(dip->di_nextents))
- xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xfs_scrub_should_check_xref(sc, &error, NULL))
return;
if (nextents != be16_to_cpu(dip->di_anextents))
- xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
if (count + acount != be64_to_cpu(dip->di_nblocks))
- xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino);
}
/* Cross-reference with the other btrees. */
@@ -700,8 +553,7 @@ xfs_scrub_inode_xref(
static void
xfs_scrub_inode_check_reflink_iflag(
struct xfs_scrub_context *sc,
- xfs_ino_t ino,
- struct xfs_buf *bp)
+ xfs_ino_t ino)
{
struct xfs_mount *mp = sc->mp;
bool has_shared;
@@ -716,9 +568,9 @@ xfs_scrub_inode_check_reflink_iflag(
XFS_INO_TO_AGBNO(mp, ino), &error))
return;
if (xfs_is_reflink_inode(sc->ip) && !has_shared)
- xfs_scrub_ino_set_preen(sc, ino, bp);
+ xfs_scrub_ino_set_preen(sc, ino);
else if (!xfs_is_reflink_inode(sc->ip) && has_shared)
- xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ xfs_scrub_ino_set_corrupt(sc, ino);
}
/* Scrub an inode. */
@@ -727,43 +579,33 @@ xfs_scrub_inode(
struct xfs_scrub_context *sc)
{
struct xfs_dinode di;
- struct xfs_buf *bp = NULL;
- struct xfs_dinode *dip;
- xfs_ino_t ino;
int error = 0;
- /* Did we get the in-core inode, or are we doing this manually? */
- if (sc->ip) {
- ino = sc->ip->i_ino;
- xfs_inode_to_disk(sc->ip, &di, 0);
- dip = &di;
- } else {
- /* Map & read inode. */
- ino = sc->sm->sm_ino;
- error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip);
- if (error || !bp)
- goto out;
+ /*
+ * If sc->ip is NULL, that means that the setup function called
+ * xfs_iget to look up the inode. xfs_iget returned -EFSCORRUPTED
+ * and a NULL inode, so flag the corruption error and return.
+ */
+ if (!sc->ip) {
+ xfs_scrub_ino_set_corrupt(sc, sc->sm->sm_ino);
+ return 0;
}
- xfs_scrub_dinode(sc, bp, dip, ino);
+ /* Scrub the inode core. */
+ xfs_inode_to_disk(sc->ip, &di, 0);
+ xfs_scrub_dinode(sc, &di, sc->ip->i_ino);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
- /* Now let's do the things that require a live inode. */
- if (!sc->ip)
- goto out;
-
/*
* Look for discrepancies between file's data blocks and the reflink
* iflag. We already checked the iflag against the file mode when
* we scrubbed the dinode.
*/
if (S_ISREG(VFS_I(sc->ip)->i_mode))
- xfs_scrub_inode_check_reflink_iflag(sc, ino, bp);
+ xfs_scrub_inode_check_reflink_iflag(sc, sc->ip->i_ino);
- xfs_scrub_inode_xref(sc, ino, dip);
+ xfs_scrub_inode_xref(sc, sc->ip->i_ino, &di);
out:
- if (bp)
- xfs_trans_brelse(sc->tp, bp);
return error;
}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 0d3851410c74..1fb88c18d455 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -167,8 +167,18 @@ xfs_scrub_parent_validate(
* if the parent pointer erroneously points to a file, we
* can't use DONTCACHE here because DONTCACHE inodes can trigger
* immediate inactive cleanup of the inode.
+ *
+ * If _iget returns -EINVAL then the parent inode number is garbage
+ * and the directory is corrupt. If _iget returns -EFSCORRUPTED
+ * or -EFSBADCRC then the parent is corrupt which is a cross
+ * referencing error. Any other error is an operational error.
*/
- error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp);
+ error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp);
+ if (error == -EINVAL) {
+ error = -EFSCORRUPTED;
+ xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
+ goto out;
+ }
if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 51daa4ae2627..6ba465e6c885 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -219,7 +219,7 @@ xfs_scrub_quota(
/* Look for problem extents. */
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
- xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
goto out_unlock_inode;
}
max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 26390991369a..39c41dfe08ee 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -116,8 +116,7 @@ xfs_scrub_xref_is_used_rt_space(
if (!xfs_scrub_should_check_xref(sc, &error, NULL))
goto out_unlock;
if (is_free)
- xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino,
- NULL);
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
out_unlock:
xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4dc896852bf0..5d2b1c241be5 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -174,53 +174,32 @@ DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error);
DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen);
DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class,
- TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr,
- void *ret_ip),
- TP_ARGS(sc, ino, daddr, ret_ip),
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, void *ret_ip),
+ TP_ARGS(sc, ino, ret_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(unsigned int, type)
- __field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, bno)
__field(void *, ret_ip)
),
TP_fast_assign(
- xfs_fsblock_t fsbno;
- xfs_agnumber_t agno;
- xfs_agblock_t bno;
-
- if (daddr) {
- fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
- agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
- bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
- } else {
- agno = XFS_INO_TO_AGNO(sc->mp, ino);
- bno = XFS_AGINO_TO_AGBNO(sc->mp,
- XFS_INO_TO_AGINO(sc->mp, ino));
- }
-
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = ino;
__entry->type = sc->sm->sm_type;
- __entry->agno = agno;
- __entry->bno = bno;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx type %u agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->type,
- __entry->agno,
- __entry->bno,
__entry->ret_ip)
)
#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \
DEFINE_EVENT(xfs_scrub_ino_error_class, name, \
TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \
- xfs_daddr_t daddr, void *ret_ip), \
- TP_ARGS(sc, ino, daddr, ret_ip))
+ void *ret_ip), \
+ TP_ARGS(sc, ino, ret_ip))
DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error);
DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9c6a830da0ee..31f1f10eecd1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -209,7 +209,8 @@ xfs_setfilesize_trans_alloc(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
+ XFS_TRANS_NOFS, &tp);
if (error)
return error;
@@ -1330,21 +1331,20 @@ xfs_get_blocks(
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
- &imap, &nimaps, XFS_BMAPI_ENTIRE);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
if (error)
goto out_unlock;
-
- if (nimaps) {
- trace_xfs_get_blocks_found(ip, offset, size,
- imap.br_state == XFS_EXT_UNWRITTEN ?
- XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
- xfs_iunlock(ip, lockmode);
- } else {
+ if (!nimaps) {
trace_xfs_get_blocks_notfound(ip, offset, size);
goto out_unlock;
}
+ trace_xfs_get_blocks_found(ip, offset, size,
+ imap.br_state == XFS_EXT_UNWRITTEN ?
+ XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
+ xfs_iunlock(ip, lockmode);
+
/* trim mapping down to size requested */
xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
@@ -1390,7 +1390,7 @@ xfs_vm_bmap(
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
- * bypasseѕ the file system for actual I/O. We really can't allow
+ * bypasses the file system for actual I/O. We really can't allow
* that on reflinks inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error.
*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c83f549dc17b..05dee8fdd895 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1208,18 +1208,15 @@ xfs_free_file_space(
/*
* Now that we've unmap all full blocks we'll have to zero out any
- * partial block at the beginning and/or end. xfs_zero_range is
- * smart enough to skip any holes, including those we just created,
- * but we must take care not to zero beyond EOF and enlarge i_size.
+ * partial block at the beginning and/or end. iomap_zero_range is smart
+ * enough to skip any holes, including those we just created, but we
+ * must take care not to zero beyond EOF and enlarge i_size.
*/
-
if (offset >= XFS_ISIZE(ip))
return 0;
-
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
-
- return xfs_zero_range(ip, offset, len, NULL);
+ return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops);
}
/*
@@ -1899,17 +1896,28 @@ xfs_swap_extents(
* performed with log redo items!
*/
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ int w = XFS_DATA_FORK;
+ uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w);
+ uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w);
+
+ /*
+ * Conceptually this shouldn't affect the shape of either bmbt,
+ * but since we atomically move extents one by one, we reserve
+ * enough space to rebuild both trees.
+ */
+ resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
+ resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
+
/*
- * Conceptually this shouldn't affect the shape of either
- * bmbt, but since we atomically move extents one by one,
- * we reserve enough space to rebuild both trees.
+ * Handle the corner case where either inode might straddle the
+ * btree format boundary. If so, the inode could bounce between
+ * btree <-> extent format on unmap -> remap cycles, freeing and
+ * allocating a bmapbt block each time.
*/
- resblks = XFS_SWAP_RMAP_SPACE_RES(mp,
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK),
- XFS_DATA_FORK) +
- XFS_SWAP_RMAP_SPACE_RES(mp,
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
- XFS_DATA_FORK);
+ if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
+ resblks += XFS_IFORK_MAXEXT(ip, w);
+ if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
+ resblks += XFS_IFORK_MAXEXT(tip, w);
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
@@ -2003,11 +2011,11 @@ xfs_swap_extents(
ip->i_cowfp = tip->i_cowfp;
tip->i_cowfp = cowfp;
- if (ip->i_cowfp && ip->i_cnextents)
+ if (ip->i_cowfp && ip->i_cowfp->if_bytes)
xfs_inode_set_cowblocks_tag(ip);
else
xfs_inode_clear_cowblocks_tag(ip);
- if (tip->i_cowfp && tip->i_cnextents)
+ if (tip->i_cowfp && tip->i_cowfp->if_bytes)
xfs_inode_set_cowblocks_tag(tip);
else
xfs_inode_clear_cowblocks_tag(tip);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d1da2ee9e6db..ac669a10c62f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1708,7 +1708,7 @@ xfs_buftarg_isolate(
* zero. If the value is already zero, we need to reclaim the
* buffer, otherwise it gets another trip through the LRU.
*/
- if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
spin_unlock(&bp->b_lock);
return LRU_ROTATE;
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 270ddb4d2313..82ad270e390e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -460,7 +460,7 @@ xfs_buf_item_unpin(
list_del_init(&bp->b_li_list);
bp->b_iodone = NULL;
} else {
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
ASSERT(bp->b_log_item == NULL);
@@ -1057,12 +1057,12 @@ xfs_buf_do_callbacks_fail(
lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
li_bio_list);
ailp = lip->li_ailp;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
if (lip->li_ops->iop_error)
lip->li_ops->iop_error(lip, bp);
}
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
}
static bool
@@ -1226,7 +1226,7 @@ xfs_buf_iodone(
*
* Either way, AIL is useless if we're forcing a shutdown.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
@@ -1246,7 +1246,7 @@ xfs_buf_resubmit_failed_buffers(
/*
* Clear XFS_LI_FAILED flag from all items before resubmit
*
+ * XFS_LI_FAILED set/clear is protected by ail_lock; callers of this
+ * XFS_LI_FAILED set/clear is protected by ail_lock, caller this
* function already have it acquired
*/
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 43572f8a1b8e..a7daef9e16bf 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -394,8 +394,6 @@ xfs_qm_dqalloc(
error1:
xfs_defer_cancel(&dfops);
error0:
- xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-
return error;
}
@@ -920,7 +918,7 @@ xfs_qm_dqflush_done(
(lip->li_flags & XFS_LI_FAILED))) {
/* xfs_trans_ail_delete() drops the AIL lock. */
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
if (lip->li_lsn == qip->qli_flush_lsn) {
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
} else {
@@ -930,7 +928,7 @@ xfs_qm_dqflush_done(
*/
if (lip->li_flags & XFS_LI_FAILED)
xfs_clear_li_failed(lip);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
}
}
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 96eaa6933709..4b331e354da7 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -157,8 +157,9 @@ xfs_dquot_item_error(
STATIC uint
xfs_qm_dquot_logitem_push(
struct xfs_log_item *lip,
- struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock)
- __acquires(&lip->li_ailp->xa_lock)
+ struct list_head *buffer_list)
+ __releases(&lip->li_ailp->ail_lock)
+ __acquires(&lip->li_ailp->ail_lock)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
struct xfs_buf *bp = lip->li_buf;
@@ -205,7 +206,7 @@ xfs_qm_dquot_logitem_push(
goto out_unlock;
}
- spin_unlock(&lip->li_ailp->xa_lock);
+ spin_unlock(&lip->li_ailp->ail_lock);
error = xfs_qm_dqflush(dqp, &bp);
if (error) {
@@ -217,7 +218,7 @@ xfs_qm_dquot_logitem_push(
xfs_buf_relse(bp);
}
- spin_lock(&lip->li_ailp->xa_lock);
+ spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
xfs_dqunlock(dqp);
return rval;
@@ -400,7 +401,7 @@ xfs_qm_qoffend_logitem_committed(
* Delete the qoff-start logitem from the AIL.
* xfs_trans_ail_delete() drops the AIL lock.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
kmem_free(qfs->qql_item.li_lv_shadow);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ccf520f0b00d..a63f5083f497 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -347,27 +347,32 @@ xfs_corruption_error(
* values, and omit the stack trace unless the error level is tuned high.
*/
void
-xfs_verifier_error(
+xfs_buf_verifier_error(
struct xfs_buf *bp,
int error,
+ const char *name,
+ void *buf,
+ size_t bufsz,
xfs_failaddr_t failaddr)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
xfs_failaddr_t fa;
+ int sz;
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- fa, bp->b_ops->name, bp->b_bn);
+ fa, bp->b_ops->name, bp->b_bn, name);
xfs_alert(mp, "Unmount and run xfs_repair");
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz);
xfs_alert(mp, "First %d bytes of corrupted metadata buffer:",
- XFS_CORRUPTION_DUMP_LEN);
- xfs_hex_dump(xfs_buf_offset(bp, 0), XFS_CORRUPTION_DUMP_LEN);
+ sz);
+ xfs_hex_dump(buf, sz);
}
if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
@@ -375,6 +380,20 @@ xfs_verifier_error(
}
/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_verifier_error(
+ struct xfs_buf *bp,
+ int error,
+ xfs_failaddr_t failaddr)
+{
+ return xfs_buf_verifier_error(bp, error, "", xfs_buf_offset(bp, 0),
+ XFS_CORRUPTION_DUMP_LEN, failaddr);
+}
+
+/*
* Warnings for inode corruption problems. Don't bother with the stack
* trace unless the error level is turned up high.
*/
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 7e728c5a46b8..ce391349e78b 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -26,6 +26,9 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, void *p, const char *filename,
int linenum, xfs_failaddr_t failaddr);
+extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
+ const char *name, void *buf, size_t bufsz,
+ xfs_failaddr_t failaddr);
extern void xfs_verifier_error(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index fe1bfee35898..eed698aa9f16 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -122,7 +122,7 @@ xfs_nfs_get_inode(
struct super_block *sb,
u64 ino,
u32 generation)
- {
+{
xfs_mount_t *mp = XFS_M(sb);
xfs_inode_t *ip;
int error;
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 77760dbf0242..13e3d1a69e76 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -611,10 +611,9 @@ xfs_extent_busy_flush(
unsigned busy_gen)
{
DEFINE_WAIT (wait);
- int log_flushed = 0, error;
+ int error;
- trace_xfs_log_force(mp, 0, _THIS_IP_);
- error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
if (error)
return;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9ea08326f876..299aee4b7b0b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -48,20 +48,6 @@
static const struct vm_operations_struct xfs_file_vm_ops;
-/*
- * Clear the specified ranges to zero through either the pagecache or DAX.
- * Holes and unwritten extents will be left as-is as they already are zeroed.
- */
-int
-xfs_zero_range(
- struct xfs_inode *ip,
- xfs_off_t pos,
- xfs_off_t count,
- bool *did_zero)
-{
- return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
-}
-
int
xfs_update_prealloc_flags(
struct xfs_inode *ip,
@@ -122,7 +108,7 @@ xfs_dir_fsync(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
STATIC int
@@ -182,7 +168,7 @@ xfs_file_fsync(
}
if (lsn) {
- error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
ip->i_itemp->ili_fsync_fields = 0;
}
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -301,31 +287,6 @@ xfs_file_read_iter(
}
/*
- * Zero any on disk space between the current EOF and the new, larger EOF.
- *
- * This handles the normal case of zeroing the remainder of the last block in
- * the file and the unusual case of zeroing blocks out beyond the size of the
- * file. This second case only happens with fixed size extents and when the
- * system crashes before the inode size was updated but after blocks were
- * allocated.
- *
- * Expects the iolock to be held exclusive, and will take the ilock internally.
- */
-int /* error (positive) */
-xfs_zero_eof(
- struct xfs_inode *ip,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize, /* current inode size */
- bool *did_zeroing)
-{
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(offset > isize);
-
- trace_xfs_zero_eof(ip, isize, offset - isize);
- return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
-}
-
-/*
* Common pre-write limit and setup checks.
*
* Called with the iolocked held either shared and exclusive according to
@@ -344,6 +305,7 @@ xfs_file_aio_write_checks(
ssize_t error = 0;
size_t count = iov_iter_count(from);
bool drained_dio = false;
+ loff_t isize;
restart:
error = generic_write_checks(iocb, from);
@@ -380,7 +342,8 @@ restart:
* and hence be able to correctly determine if we need to run zeroing.
*/
spin_lock(&ip->i_flags_lock);
- if (iocb->ki_pos > i_size_read(inode)) {
+ isize = i_size_read(inode);
+ if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
@@ -401,7 +364,10 @@ restart:
drained_dio = true;
goto restart;
}
- error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL);
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+ error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
+ NULL, &xfs_iomap_ops);
if (error)
return error;
} else
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 8b4545623e25..523792768080 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -217,7 +217,7 @@ xfs_growfs_data_private(
}
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
- for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+ for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
error = xfs_bwrite(bp);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index d53a316162d6..9a18f69f6e96 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -483,7 +483,28 @@ xfs_iget_cache_miss(
trace_xfs_iget_miss(ip);
- if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+
+ /*
+ * If we are allocating a new inode, then check what was returned is
+ * actually a free, empty inode. If we are not allocating an inode,
+ * the check we didn't find a free inode.
+ */
+ if (flags & XFS_IGET_CREATE) {
+ if (VFS_I(ip)->i_mode != 0) {
+ xfs_warn(mp,
+"Corruption detected! Free inode 0x%llx not marked free on disk",
+ ino);
+ error = -EFSCORRUPTED;
+ goto out_destroy;
+ }
+ if (ip->i_d.di_nblocks != 0) {
+ xfs_warn(mp,
+"Corruption detected! Free inode 0x%llx has blocks allocated!",
+ ino);
+ error = -EFSCORRUPTED;
+ goto out_destroy;
+ }
+ } else if (VFS_I(ip)->i_mode == 0) {
error = -ENOENT;
goto out_destroy;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 604ee384a00a..3e3aab3888fa 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1872,6 +1872,7 @@ xfs_inactive(
xfs_inode_t *ip)
{
struct xfs_mount *mp;
+ struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
int error;
int truncate = 0;
@@ -1892,6 +1893,10 @@ xfs_inactive(
if (mp->m_flags & XFS_MOUNT_RDONLY)
return;
+ /* Try to clean out the cow blocks if there are any. */
+ if (xfs_is_reflink_inode(ip) && cow_ifp->if_bytes > 0)
+ xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+
if (VFS_I(ip)->i_nlink != 0) {
/*
* force is true because we are evicting an inode from the
@@ -2470,6 +2475,10 @@ xfs_ifree(
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+ /* Don't attempt to replay owner changes for a deleted inode */
+ ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+
/*
* Bump the generation count so no one will be confused
* by reincarnations of this inode.
@@ -2497,7 +2506,7 @@ xfs_iunpin(
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
+ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3e8dc990d41c..132d8aa2afc4 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -443,10 +443,6 @@ enum xfs_prealloc_flags {
int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
-int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
- xfs_fsize_t isize, bool *did_zeroing);
-int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
- bool *did_zero);
/* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d5037f060d6f..34b91b789702 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -502,8 +502,8 @@ STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
struct list_head *buffer_list)
- __releases(&lip->li_ailp->xa_lock)
- __acquires(&lip->li_ailp->xa_lock)
+ __releases(&lip->li_ailp->ail_lock)
+ __acquires(&lip->li_ailp->ail_lock)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
@@ -562,7 +562,7 @@ xfs_inode_item_push(
ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
- spin_unlock(&lip->li_ailp->xa_lock);
+ spin_unlock(&lip->li_ailp->ail_lock);
error = xfs_iflush(ip, &bp);
if (!error) {
@@ -571,7 +571,7 @@ xfs_inode_item_push(
xfs_buf_relse(bp);
}
- spin_lock(&lip->li_ailp->xa_lock);
+ spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return rval;
@@ -579,9 +579,6 @@ out_unlock:
/*
* Unlock the inode associated with the inode log item.
- * Clear the fields of the inode and inode log item that
- * are specific to the current transaction. If the
- * hold flags is set, do not unlock the inode.
*/
STATIC void
xfs_inode_item_unlock(
@@ -637,10 +634,6 @@ xfs_inode_item_committed(
return lsn;
}
-/*
- * XXX rcc - this one really has to do something. Probably needs
- * to stamp in a new field in the incore inode.
- */
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
@@ -759,7 +752,7 @@ xfs_iflush_done(
bool mlip_changed = false;
/* this is an opencoded batch version of xfs_trans_ail_delete */
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
@@ -770,15 +763,15 @@ xfs_iflush_done(
}
if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- xlog_assign_tail_lsn_locked(ailp->xa_mount);
- if (list_empty(&ailp->xa_ail))
- wake_up_all(&ailp->xa_empty);
+ if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
+ xlog_assign_tail_lsn_locked(ailp->ail_mount);
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
}
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
if (mlip_changed)
- xfs_log_space_wake(ailp->xa_mount);
+ xfs_log_space_wake(ailp->ail_mount);
}
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 66e1edbfb2b2..046469fcc1b8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -955,15 +955,29 @@ static inline bool imap_needs_alloc(struct inode *inode,
(IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
}
+static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps)
+{
+ return nimaps &&
+ imap->br_startblock != HOLESTARTBLOCK &&
+ imap->br_state != XFS_EXT_UNWRITTEN;
+}
+
static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
{
/*
- * COW writes will allocate delalloc space, so we need to make sure
- * to take the lock exclusively here.
+ * COW writes may allocate delalloc space or convert unwritten COW
+ * extents, so we need to make sure to take the lock exclusively here.
*/
if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
return true;
- if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
+
+ /*
+ * Extents not yet cached requires exclusive access, don't block.
+ * This is an opencoded xfs_ilock_data_map_shared() to cater for the
+ * non-blocking behaviour.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ !(ip->i_df.if_flags & XFS_IFEXTENTS))
return true;
return false;
}
@@ -993,16 +1007,18 @@ xfs_file_iomap_begin(
return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
}
- if (need_excl_ilock(ip, flags)) {
+ if (need_excl_ilock(ip, flags))
lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- } else {
- lockmode = xfs_ilock_data_map_shared(ip);
- }
+ else
+ lockmode = XFS_ILOCK_SHARED;
- if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) {
- error = -EAGAIN;
- goto out_unlock;
+ if (flags & IOMAP_NOWAIT) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ return -EAGAIN;
+ if (!xfs_ilock_nowait(ip, lockmode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lockmode);
}
ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -1024,7 +1040,9 @@ xfs_file_iomap_begin(
goto out_unlock;
}
- if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ if (xfs_is_reflink_inode(ip) &&
+ ((flags & IOMAP_WRITE) ||
+ ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
if (flags & IOMAP_DIRECT) {
/*
* A reflinked inode will result in CoW alloc.
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 56475fcd76f2..e0307fbff911 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -46,6 +46,7 @@
#include <linux/security.h>
#include <linux/iomap.h>
#include <linux/slab.h>
+#include <linux/iversion.h>
/*
* Directories have different lock order w.r.t. mmap_sem compared to regular
@@ -874,7 +875,9 @@ xfs_setattr_size(
* truncate.
*/
if (newsize > oldsize) {
- error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
+ trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
+ error = iomap_zero_range(inode, oldsize, newsize - oldsize,
+ &did_zeroing, &xfs_iomap_ops);
} else {
error = iomap_truncate_page(inode, newsize, &did_zeroing,
&xfs_iomap_ops);
@@ -1052,11 +1055,21 @@ xfs_vn_update_time(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
+ int log_flags = XFS_ILOG_TIMESTAMP;
struct xfs_trans *tp;
int error;
trace_xfs_update_time(ip);
+ if (inode->i_sb->s_flags & SB_LAZYTIME) {
+ if (!((flags & S_VERSION) &&
+ inode_maybe_inc_iversion(inode, false)))
+ return generic_update_time(inode, now, flags);
+
+ /* Capture the iversion update that just occurred */
+ log_flags |= XFS_ILOG_CORE;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
if (error)
return error;
@@ -1070,7 +1083,7 @@ xfs_vn_update_time(
inode->i_atime = *now;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+ xfs_trans_log_inode(tp, ip, log_flags);
return xfs_trans_commit(tp);
}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3e5ba1ecc080..b9c9c848146b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -869,7 +869,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
return 0;
}
- error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
#ifdef DEBUG
@@ -1149,7 +1149,7 @@ xlog_assign_tail_lsn_locked(
struct xfs_log_item *lip;
xfs_lsn_t tail_lsn;
- assert_spin_locked(&mp->m_ail->xa_lock);
+ assert_spin_locked(&mp->m_ail->ail_lock);
/*
* To make sure we always have a valid LSN for the log tail we keep
@@ -1172,9 +1172,9 @@ xlog_assign_tail_lsn(
{
xfs_lsn_t tail_lsn;
- spin_lock(&mp->m_ail->xa_lock);
+ spin_lock(&mp->m_ail->ail_lock);
tail_lsn = xlog_assign_tail_lsn_locked(mp);
- spin_unlock(&mp->m_ail->xa_lock);
+ spin_unlock(&mp->m_ail->ail_lock);
return tail_lsn;
}
@@ -3304,269 +3304,215 @@ xlog_state_switch_iclogs(
* not in the active nor dirty state.
*/
int
-_xfs_log_force(
+xfs_log_force(
struct xfs_mount *mp,
- uint flags,
- int *log_flushed)
+ uint flags)
{
struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
XFS_STATS_INC(mp, xs_log_force);
+ trace_xfs_log_force(mp, 0, _RET_IP_);
xlog_cil_force(log);
spin_lock(&log->l_icloglock);
-
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
- }
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ goto out_error;
- /* If the head iclog is not active nor dirty, we just attach
- * ourselves to the head and go to sleep.
- */
- if (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY) {
+ if (iclog->ic_state == XLOG_STATE_DIRTY ||
+ (iclog->ic_state == XLOG_STATE_ACTIVE &&
+ atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) {
/*
- * If the head is dirty or (active and empty), then
- * we need to look at the previous iclog. If the previous
- * iclog is active or dirty we are done. There is nothing
- * to sync out. Otherwise, we attach ourselves to the
+ * If the head is dirty or (active and empty), then we need to
+ * look at the previous iclog.
+ *
+ * If the previous iclog is active or dirty we are done. There
+ * is nothing to sync out. Otherwise, we attach ourselves to the
* previous iclog and go to sleep.
*/
- if (iclog->ic_state == XLOG_STATE_DIRTY ||
- (atomic_read(&iclog->ic_refcnt) == 0
- && iclog->ic_offset == 0)) {
- iclog = iclog->ic_prev;
- if (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY)
- goto no_sleep;
- else
- goto maybe_sleep;
- } else {
- if (atomic_read(&iclog->ic_refcnt) == 0) {
- /* We are the only one with access to this
- * iclog. Flush it out now. There should
- * be a roundoff of zero to show that someone
- * has already taken care of the roundoff from
- * the previous sync.
- */
- atomic_inc(&iclog->ic_refcnt);
- lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
+ iclog = iclog->ic_prev;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
+ goto out_unlock;
+ } else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ if (atomic_read(&iclog->ic_refcnt) == 0) {
+ /*
+ * We are the only one with access to this iclog.
+ *
+ * Flush it out now. There should be a roundoff of zero
+ * to show that someone has already taken care of the
+ * roundoff from the previous sync.
+ */
+ atomic_inc(&iclog->ic_refcnt);
+ lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ xlog_state_switch_iclogs(log, iclog, 0);
+ spin_unlock(&log->l_icloglock);
- if (xlog_state_release_iclog(log, iclog))
- return -EIO;
-
- if (log_flushed)
- *log_flushed = 1;
- spin_lock(&log->l_icloglock);
- if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
- iclog->ic_state != XLOG_STATE_DIRTY)
- goto maybe_sleep;
- else
- goto no_sleep;
- } else {
- /* Someone else is writing to this iclog.
- * Use its call to flush out the data. However,
- * the other thread may not force out this LR,
- * so we mark it WANT_SYNC.
- */
- xlog_state_switch_iclogs(log, iclog, 0);
- goto maybe_sleep;
- }
- }
- }
+ if (xlog_state_release_iclog(log, iclog))
+ return -EIO;
- /* By the time we come around again, the iclog could've been filled
- * which would give it another lsn. If we have a new lsn, just
- * return because the relevant data has been flushed.
- */
-maybe_sleep:
- if (flags & XFS_LOG_SYNC) {
- /*
- * We must check if we're shutting down here, before
- * we wait, while we're holding the l_icloglock.
- * Then we check again after waking up, in case our
- * sleep was disturbed by a bad news.
- */
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
+ spin_lock(&log->l_icloglock);
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
+ goto out_unlock;
+ } else {
+ /*
+ * Someone else is writing to this iclog.
+ *
+ * Use its call to flush out the data. However, the
+ * other thread may not force out this LR, so we mark
+ * it WANT_SYNC.
+ */
+ xlog_state_switch_iclogs(log, iclog, 0);
}
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ } else {
/*
- * No need to grab the log lock here since we're
- * only deciding whether or not to return EIO
- * and the memory read should be atomic.
+ * If the head iclog is not active nor dirty, we just attach
+ * ourselves to the head and go to sleep if necessary.
*/
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
- } else {
-
-no_sleep:
- spin_unlock(&log->l_icloglock);
+ ;
}
+
+ if (!(flags & XFS_LOG_SYNC))
+ goto out_unlock;
+
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ goto out_error;
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return -EIO;
return 0;
-}
-/*
- * Wrapper for _xfs_log_force(), to be used when caller doesn't care
- * about errors or whether the log was flushed or not. This is the normal
- * interface to use when trying to unpin items or move the log forward.
- */
-void
-xfs_log_force(
- xfs_mount_t *mp,
- uint flags)
-{
- trace_xfs_log_force(mp, 0, _RET_IP_);
- _xfs_log_force(mp, flags, NULL);
+out_unlock:
+ spin_unlock(&log->l_icloglock);
+ return 0;
+out_error:
+ spin_unlock(&log->l_icloglock);
+ return -EIO;
}
-/*
- * Force the in-core log to disk for a specific LSN.
- *
- * Find in-core log with lsn.
- * If it is in the DIRTY state, just return.
- * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
- * state and go to sleep or return.
- * If it is in any other state, go to sleep or return.
- *
- * Synchronous forces are implemented with a signal variable. All callers
- * to force a given lsn to disk will wait on a the sv attached to the
- * specific in-core log. When given in-core log finally completes its
- * write to disk, that thread will wake up all threads waiting on the
- * sv.
- */
-int
-_xfs_log_force_lsn(
+static int
+__xfs_log_force_lsn(
struct xfs_mount *mp,
xfs_lsn_t lsn,
uint flags,
- int *log_flushed)
+ int *log_flushed,
+ bool already_slept)
{
struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
- int already_slept = 0;
- ASSERT(lsn != 0);
-
- XFS_STATS_INC(mp, xs_log_force);
-
- lsn = xlog_cil_force_lsn(log, lsn);
- if (lsn == NULLCOMMITLSN)
- return 0;
-
-try_again:
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
- }
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ goto out_error;
- do {
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
- iclog = iclog->ic_next;
- continue;
- }
+ while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+ iclog = iclog->ic_next;
+ if (iclog == log->l_iclog)
+ goto out_unlock;
+ }
- if (iclog->ic_state == XLOG_STATE_DIRTY) {
- spin_unlock(&log->l_icloglock);
- return 0;
- }
+ if (iclog->ic_state == XLOG_STATE_DIRTY)
+ goto out_unlock;
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
- /*
- * We sleep here if we haven't already slept (e.g.
- * this is the first time we've looked at the correct
- * iclog buf) and the buffer before us is going to
- * be sync'ed. The reason for this is that if we
- * are doing sync transactions here, by waiting for
- * the previous I/O to complete, we can allow a few
- * more transactions into this iclog before we close
- * it down.
- *
- * Otherwise, we mark the buffer WANT_SYNC, and bump
- * up the refcnt so we can release the log (which
- * drops the ref count). The state switch keeps new
- * transaction commits from using this buffer. When
- * the current commits finish writing into the buffer,
- * the refcount will drop to zero and the buffer will
- * go out then.
- */
- if (!already_slept &&
- (iclog->ic_prev->ic_state &
- (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
- ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+ if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ /*
+ * We sleep here if we haven't already slept (e.g. this is the
+ * first time we've looked at the correct iclog buf) and the
+ * buffer before us is going to be sync'ed. The reason for this
+ * is that if we are doing sync transactions here, by waiting
+ * for the previous I/O to complete, we can allow a few more
+ * transactions into this iclog before we close it down.
+ *
+ * Otherwise, we mark the buffer WANT_SYNC, and bump up the
+ * refcnt so we can release the log (which drops the ref count).
+ * The state switch keeps new transaction commits from using
+ * this buffer. When the current commits finish writing into
+ * the buffer, the refcount will drop to zero and the buffer
+ * will go out then.
+ */
+ if (!already_slept &&
+ (iclog->ic_prev->ic_state &
+ (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
+ ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
- XFS_STATS_INC(mp, xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_prev->ic_write_wait,
- &log->l_icloglock);
- already_slept = 1;
- goto try_again;
- }
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
- if (xlog_state_release_iclog(log, iclog))
- return -EIO;
- if (log_flushed)
- *log_flushed = 1;
- spin_lock(&log->l_icloglock);
+ xlog_wait(&iclog->ic_prev->ic_write_wait,
+ &log->l_icloglock);
+ return -EAGAIN;
}
+ atomic_inc(&iclog->ic_refcnt);
+ xlog_state_switch_iclogs(log, iclog, 0);
+ spin_unlock(&log->l_icloglock);
+ if (xlog_state_release_iclog(log, iclog))
+ return -EIO;
+ if (log_flushed)
+ *log_flushed = 1;
+ spin_lock(&log->l_icloglock);
+ }
- if ((flags & XFS_LOG_SYNC) && /* sleep */
- !(iclog->ic_state &
- (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
- /*
- * Don't wait on completion if we know that we've
- * gotten a log write error.
- */
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
- }
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- /*
- * No need to grab the log lock here since we're
- * only deciding whether or not to return EIO
- * and the memory read should be atomic.
- */
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
- } else { /* just return */
- spin_unlock(&log->l_icloglock);
- }
+ if (!(flags & XFS_LOG_SYNC) ||
+ (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)))
+ goto out_unlock;
- return 0;
- } while (iclog != log->l_iclog);
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ goto out_error;
+
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return -EIO;
+ return 0;
+out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
+out_error:
+ spin_unlock(&log->l_icloglock);
+ return -EIO;
}
/*
- * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
- * about errors or whether the log was flushed or not. This is the normal
- * interface to use when trying to unpin items or move the log forward.
+ * Force the in-core log to disk for a specific LSN.
+ *
+ * Find in-core log with lsn.
+ * If it is in the DIRTY state, just return.
+ * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ * state and go to sleep or return.
+ * If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a wait queue. All callers trying
+ * to force a given lsn to disk must wait on the queue attached to the
+ * specific in-core log. When given in-core log finally completes its write
+ * to disk, that thread will wake up all threads waiting on the queue.
*/
-void
+int
xfs_log_force_lsn(
- xfs_mount_t *mp,
- xfs_lsn_t lsn,
- uint flags)
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn,
+ uint flags,
+ int *log_flushed)
{
+ int ret;
+ ASSERT(lsn != 0);
+
+ XFS_STATS_INC(mp, xs_log_force);
trace_xfs_log_force(mp, lsn, _RET_IP_);
- _xfs_log_force_lsn(mp, lsn, flags, NULL);
+
+ lsn = xlog_cil_force_lsn(mp->m_log, lsn);
+ if (lsn == NULLCOMMITLSN)
+ return 0;
+
+ ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN)
+ ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
+ return ret;
}
/*
@@ -4035,7 +3981,7 @@ xfs_log_force_umount(
* to guarantee this.
*/
if (!logerror)
- _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+ xfs_log_force(mp, XFS_LOG_SYNC);
/*
* mark the filesystem and the as in a shutdown state and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index bf212772595c..7e2d62922a16 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -129,18 +129,9 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
bool regrant);
-int _xfs_log_force(struct xfs_mount *mp,
- uint flags,
- int *log_forced);
-void xfs_log_force(struct xfs_mount *mp,
- uint flags);
-int _xfs_log_force_lsn(struct xfs_mount *mp,
- xfs_lsn_t lsn,
- uint flags,
- int *log_forced);
-void xfs_log_force_lsn(struct xfs_mount *mp,
- xfs_lsn_t lsn,
- uint flags);
+int xfs_log_force(struct xfs_mount *mp, uint flags);
+int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
+ int *log_forced);
int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
xfs_daddr_t start_block,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 43aa42a3a5d3..cb376ac8a595 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -202,7 +202,7 @@ xlog_cil_alloc_shadow_bufs(
*/
kmem_free(lip->li_lv_shadow);
- lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+ lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
lv->lv_item = lip;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 00240c9ee72e..2b2383f1895e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3173,13 +3173,6 @@ xlog_recover_inode_pass2(
/* recover the log dinode inode into the on disk inode */
xfs_log_dinode_to_disk(ldip, dip);
- /* the rest is in on-disk format */
- if (item->ri_buf[1].i_len > isize) {
- memcpy((char *)dip + isize,
- item->ri_buf[1].i_addr + isize,
- item->ri_buf[1].i_len - isize);
- }
-
fields = in_f->ilf_fields;
if (fields & XFS_ILOG_DEV)
xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
@@ -3252,7 +3245,9 @@ xlog_recover_inode_pass2(
}
out_owner_change:
- if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+ /* Recover the swapext owner change unless inode has been deleted */
+ if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
+ (dip->di_mode != 0))
error = xfs_recover_inode_owner_change(mp, dip, in_f,
buffer_list);
/* re-generate the checksum. */
@@ -3434,7 +3429,7 @@ xlog_recover_efi_pass2(
}
atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
- spin_lock(&log->l_ailp->xa_lock);
+ spin_lock(&log->l_ailp->ail_lock);
/*
* The EFI has two references. One for the EFD and one for EFI to ensure
* it makes it into the AIL. Insert the EFI into the AIL directly and
@@ -3477,7 +3472,7 @@ xlog_recover_efd_pass2(
* Search for the EFI with the id in the EFD format structure in the
* AIL.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (lip->li_type == XFS_LI_EFI) {
@@ -3487,9 +3482,9 @@ xlog_recover_efd_pass2(
* Drop the EFD reference to the EFI. This
* removes the EFI from the AIL and frees it.
*/
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_efi_release(efip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
break;
}
}
@@ -3497,7 +3492,7 @@ xlog_recover_efd_pass2(
}
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return 0;
}
@@ -3530,7 +3525,7 @@ xlog_recover_rui_pass2(
}
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
- spin_lock(&log->l_ailp->xa_lock);
+ spin_lock(&log->l_ailp->ail_lock);
/*
* The RUI has two references. One for the RUD and one for RUI to ensure
* it makes it into the AIL. Insert the RUI into the AIL directly and
@@ -3570,7 +3565,7 @@ xlog_recover_rud_pass2(
* Search for the RUI with the id in the RUD format structure in the
* AIL.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (lip->li_type == XFS_LI_RUI) {
@@ -3580,9 +3575,9 @@ xlog_recover_rud_pass2(
* Drop the RUD reference to the RUI. This
* removes the RUI from the AIL and frees it.
*/
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_rui_release(ruip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
break;
}
}
@@ -3590,7 +3585,7 @@ xlog_recover_rud_pass2(
}
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return 0;
}
@@ -3646,7 +3641,7 @@ xlog_recover_cui_pass2(
}
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
- spin_lock(&log->l_ailp->xa_lock);
+ spin_lock(&log->l_ailp->ail_lock);
/*
* The CUI has two references. One for the CUD and one for CUI to ensure
* it makes it into the AIL. Insert the CUI into the AIL directly and
@@ -3687,7 +3682,7 @@ xlog_recover_cud_pass2(
* Search for the CUI with the id in the CUD format structure in the
* AIL.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (lip->li_type == XFS_LI_CUI) {
@@ -3697,9 +3692,9 @@ xlog_recover_cud_pass2(
* Drop the CUD reference to the CUI. This
* removes the CUI from the AIL and frees it.
*/
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_cui_release(cuip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
break;
}
}
@@ -3707,7 +3702,7 @@ xlog_recover_cud_pass2(
}
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return 0;
}
@@ -3765,7 +3760,7 @@ xlog_recover_bui_pass2(
}
atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
- spin_lock(&log->l_ailp->xa_lock);
+ spin_lock(&log->l_ailp->ail_lock);
/*
* The RUI has two references. One for the RUD and one for RUI to ensure
* it makes it into the AIL. Insert the RUI into the AIL directly and
@@ -3806,7 +3801,7 @@ xlog_recover_bud_pass2(
* Search for the BUI with the id in the BUD format structure in the
* AIL.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (lip->li_type == XFS_LI_BUI) {
@@ -3816,9 +3811,9 @@ xlog_recover_bud_pass2(
* Drop the BUD reference to the BUI. This
* removes the BUI from the AIL and frees it.
*/
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_bui_release(buip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
break;
}
}
@@ -3826,7 +3821,7 @@ xlog_recover_bud_pass2(
}
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return 0;
}
@@ -4659,9 +4654,9 @@ xlog_recover_process_efi(
if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
return 0;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
error = xfs_efi_recover(mp, efip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
return error;
}
@@ -4677,9 +4672,9 @@ xlog_recover_cancel_efi(
efip = container_of(lip, struct xfs_efi_log_item, efi_item);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_efi_release(efip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
}
/* Recover the RUI if necessary. */
@@ -4699,9 +4694,9 @@ xlog_recover_process_rui(
if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
return 0;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
error = xfs_rui_recover(mp, ruip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
return error;
}
@@ -4717,9 +4712,9 @@ xlog_recover_cancel_rui(
ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_rui_release(ruip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
}
/* Recover the CUI if necessary. */
@@ -4740,9 +4735,9 @@ xlog_recover_process_cui(
if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
return 0;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
error = xfs_cui_recover(mp, cuip, dfops);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
return error;
}
@@ -4758,9 +4753,9 @@ xlog_recover_cancel_cui(
cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_cui_release(cuip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
}
/* Recover the BUI if necessary. */
@@ -4781,9 +4776,9 @@ xlog_recover_process_bui(
if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
return 0;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
error = xfs_bui_recover(mp, buip, dfops);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
return error;
}
@@ -4799,9 +4794,9 @@ xlog_recover_cancel_bui(
buip = container_of(lip, struct xfs_bui_log_item, bui_item);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
xfs_bui_release(buip);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
}
/* Is this log item a deferred action intent? */
@@ -4889,7 +4884,7 @@ xlog_recover_process_intents(
#endif
ailp = log->l_ailp;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
@@ -4943,7 +4938,7 @@ xlog_recover_process_intents(
}
out:
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
if (error)
xfs_defer_cancel(&dfops);
else
@@ -4966,7 +4961,7 @@ xlog_recover_cancel_intents(
struct xfs_ail *ailp;
ailp = log->l_ailp;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
/*
@@ -5000,7 +4995,7 @@ xlog_recover_cancel_intents(
}
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return error;
}
@@ -5127,16 +5122,9 @@ xlog_recover_process_iunlinks(
xfs_agino_t agino;
int bucket;
int error;
- uint mp_dmevmask;
mp = log->l_mp;
- /*
- * Prevent any DMAPI event from being sent while in this function.
- */
- mp_dmevmask = mp->m_dmevmask;
- mp->m_dmevmask = 0;
-
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
/*
* Find the agi for this ag.
@@ -5172,8 +5160,6 @@ xlog_recover_process_iunlinks(
}
xfs_buf_rele(agibp);
}
-
- mp->m_dmevmask = mp_dmevmask;
}
STATIC int
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 98fd41cbb9e1..a901b86772f8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -803,8 +803,6 @@ xfs_mountfs(
get_unaligned_be16(&sbp->sb_uuid.b[4]);
mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
- mp->m_dmevmask = 0; /* not persistent; set after each mount */
-
error = xfs_da_mount(mp);
if (error) {
xfs_warn(mp, "Failed dir/attr init: %d", error);
@@ -819,8 +817,6 @@ xfs_mountfs(
/*
* Allocate and initialize the per-ag data.
*/
- spin_lock_init(&mp->m_perag_lock);
- INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e0792d036be2..10b90bbc5162 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -138,7 +138,6 @@ typedef struct xfs_mount {
spinlock_t m_perag_lock; /* lock for m_perag_tree */
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
- uint m_dmevmask; /* DMI events for this FS */
uint64_t m_flags; /* global mount flags */
bool m_inotbt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
@@ -326,8 +325,9 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
/* per-AG block reservation data structures*/
enum xfs_ag_resv_type {
XFS_AG_RESV_NONE = 0,
- XFS_AG_RESV_METADATA,
XFS_AG_RESV_AGFL,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_RMAPBT,
};
struct xfs_ag_resv {
@@ -353,6 +353,7 @@ typedef struct xfs_perag {
char pagi_inodeok; /* The agi is ok for inodes */
uint8_t pagf_levels[XFS_BTNUM_AGF];
/* # of levels in bno & cnt btree */
+ bool pagf_agflreset; /* agfl requires reset before use */
uint32_t pagf_flcount; /* count of blocks in freelist */
xfs_extlen_t pagf_freeblks; /* total free blocks */
xfs_extlen_t pagf_longest; /* longest free space */
@@ -391,8 +392,8 @@ typedef struct xfs_perag {
/* Blocks reserved for all kinds of metadata. */
struct xfs_ag_resv pag_meta_resv;
- /* Blocks reserved for just AGFL-based metadata. */
- struct xfs_ag_resv pag_agfl_resv;
+ /* Blocks reserved for the reverse mapping btree. */
+ struct xfs_ag_resv pag_rmapbt_resv;
/* reference count */
uint8_t pagf_refcount_level;
@@ -406,8 +407,8 @@ xfs_perag_resv(
switch (type) {
case XFS_AG_RESV_METADATA:
return &pag->pag_meta_resv;
- case XFS_AG_RESV_AGFL:
- return &pag->pag_agfl_resv;
+ case XFS_AG_RESV_RMAPBT:
+ return &pag->pag_rmapbt_resv;
default:
return NULL;
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 3a55d6fc271b..7a39f40645f7 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -23,6 +23,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_shared.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
@@ -456,10 +457,12 @@ xfs_cui_recover(
* transaction. Normally, any work that needs to be deferred
* gets attached to the same defer_ops that scheduled the
* refcount update. However, we're in log recovery here, so we
- * we create our own defer_ops and use that to finish up any
- * work that doesn't fit.
+ * use the passed in defer_ops to finish up any work that
+ * doesn't fit. We need to reserve enough blocks to handle a
+ * full btree split on either end of the refcount range.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
cudp = xfs_trans_get_cud(tp, cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 270246943a06..cdbd342a5249 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -394,7 +394,7 @@ xfs_reflink_allocate_cow(
retry:
ASSERT(xfs_is_reflink_inode(ip));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
* Even if the extent is not shared we might have a preallocation for
@@ -668,7 +668,7 @@ xfs_reflink_cancel_cow_range(
/* Start a rolling transaction to remove the mappings */
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- 0, 0, 0, &tp);
+ 0, 0, XFS_TRANS_NOFS, &tp);
if (error)
goto out;
@@ -741,7 +741,7 @@ xfs_reflink_end_cow(
(unsigned int)(end_fsb - offset_fsb),
XFS_DATA_FORK);
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- resblks, 0, XFS_TRANS_RESERVE, &tp);
+ resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
if (error)
goto out;
@@ -762,10 +762,8 @@ xfs_reflink_end_cow(
xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
/* Extent delete may have bumped ext forward */
- if (!del.br_blockcount) {
- xfs_iext_prev(ifp, &icur);
- goto next_extent;
- }
+ if (!del.br_blockcount)
+ goto prev_extent;
ASSERT(!isnullstartblock(got.br_startblock));
@@ -774,10 +772,8 @@ xfs_reflink_end_cow(
* speculatively preallocated CoW extents that have been
* allocated but have not yet been involved in a write.
*/
- if (got.br_state == XFS_EXT_UNWRITTEN) {
- xfs_iext_prev(ifp, &icur);
- goto next_extent;
- }
+ if (got.br_state == XFS_EXT_UNWRITTEN)
+ goto prev_extent;
/* Unmap the old blocks in the data fork. */
xfs_defer_init(&dfops, &firstfsb);
@@ -816,9 +812,12 @@ xfs_reflink_end_cow(
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_defer;
-next_extent:
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
+ continue;
+prev_extent:
+ if (!xfs_iext_prev_extent(ifp, &icur, &got))
+ break;
}
error = xfs_trans_commit(tp);
@@ -1061,7 +1060,7 @@ xfs_reflink_ag_has_free_space(
return 0;
pag = xfs_perag_get(mp, agno);
- if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) ||
+ if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
error = -ENOSPC;
xfs_perag_put(pag);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index f3b139c9aa16..49d3124863a8 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -23,6 +23,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_shared.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
@@ -470,7 +471,8 @@ xfs_rui_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
rudp = xfs_trans_get_rud(tp, ruip);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 7aba628dc527..612c1d5348b3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -250,6 +250,7 @@ xfs_parseargs(
return -EINVAL;
break;
case Opt_logdev:
+ kfree(mp->m_logname);
mp->m_logname = match_strdup(args);
if (!mp->m_logname)
return -ENOMEM;
@@ -258,6 +259,7 @@ xfs_parseargs(
xfs_warn(mp, "%s option not allowed on this system", p);
return -EINVAL;
case Opt_rtdev:
+ kfree(mp->m_rtname);
mp->m_rtname = match_strdup(args);
if (!mp->m_rtname)
return -ENOMEM;
@@ -970,7 +972,6 @@ xfs_fs_destroy_inode(
struct inode *inode)
{
struct xfs_inode *ip = XFS_I(inode);
- int error;
trace_xfs_destroy_inode(ip);
@@ -978,14 +979,6 @@ xfs_fs_destroy_inode(
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
- if (xfs_is_reflink_inode(ip)) {
- error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
- if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
- xfs_warn(ip->i_mount,
-"Error %d while evicting CoW blocks for inode %llu.",
- error, ip->i_ino);
- }
-
xfs_inactive(ip);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -1007,6 +1000,28 @@ xfs_fs_destroy_inode(
xfs_inode_set_reclaim_tag(ip);
}
+static void
+xfs_fs_dirty_inode(
+ struct inode *inode,
+ int flag)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+
+ if (!(inode->i_sb->s_flags & SB_LAZYTIME))
+ return;
+ if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME))
+ return;
+
+ if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp))
+ return;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+ xfs_trans_commit(tp);
+}
+
/*
* Slab object creation initialisation for the XFS inode.
* This covers only the idempotent fields in the XFS inode;
@@ -1564,29 +1579,48 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_fdblocks);
}
-STATIC int
-xfs_fs_fill_super(
- struct super_block *sb,
- void *data,
- int silent)
+static struct xfs_mount *
+xfs_mount_alloc(
+ struct super_block *sb)
{
- struct inode *root;
- struct xfs_mount *mp = NULL;
- int flags = 0, error = -ENOMEM;
+ struct xfs_mount *mp;
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
- goto out;
+ return NULL;
+ mp->m_super = sb;
spin_lock_init(&mp->m_sb_lock);
+ spin_lock_init(&mp->m_agirotor_lock);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+ spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ return mp;
+}
- mp->m_super = sb;
+
+STATIC int
+xfs_fs_fill_super(
+ struct super_block *sb,
+ void *data,
+ int silent)
+{
+ struct inode *root;
+ struct xfs_mount *mp = NULL;
+ int flags = 0, error = -ENOMEM;
+
+ /*
+ * allocate mp and do all low-level struct initializations before we
+ * attach it to the super
+ */
+ mp = xfs_mount_alloc(sb);
+ if (!mp)
+ goto out;
sb->s_fs_info = mp;
error = xfs_parseargs(mp, (char *)data);
@@ -1787,6 +1821,7 @@ xfs_fs_free_cached_objects(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
.drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 945de08af7ba..a982c0b623d0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1477,7 +1477,7 @@ TRACE_EVENT(xfs_extent_busy_trim,
__entry->tlen)
);
-TRACE_EVENT(xfs_agf,
+DECLARE_EVENT_CLASS(xfs_agf_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
unsigned long caller_ip),
TP_ARGS(mp, agf, flags, caller_ip),
@@ -1533,6 +1533,13 @@ TRACE_EVENT(xfs_agf,
__entry->longest,
(void *)__entry->caller_ip)
);
+#define DEFINE_AGF_EVENT(name) \
+DEFINE_EVENT(xfs_agf_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agf, flags, caller_ip))
+DEFINE_AGF_EVENT(xfs_agf);
+DEFINE_AGF_EVENT(xfs_agfl_reset);
TRACE_EVENT(xfs_free_extent,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 86f92df32c42..d6d8f9d129a7 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -119,8 +119,11 @@ xfs_trans_dup(
/* We gave our writer reference to the new transaction */
tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
+
+ ASSERT(tp->t_blk_res >= tp->t_blk_res_used);
ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
tp->t_blk_res = tp->t_blk_res_used;
+
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
ntp->t_pflags = tp->t_pflags;
@@ -344,13 +347,14 @@ xfs_trans_mod_sb(
break;
case XFS_TRANS_SB_FDBLOCKS:
/*
- * Track the number of blocks allocated in the
- * transaction. Make sure it does not exceed the
- * number reserved.
+ * Track the number of blocks allocated in the transaction.
+ * Make sure it does not exceed the number reserved. If it does,
+ * shut down, as this can lead to accounting inconsistency.
*/
if (delta < 0) {
tp->t_blk_res_used += (uint)-delta;
- ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
+ if (tp->t_blk_res_used > tp->t_blk_res)
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
tp->t_fdblocks_delta += delta;
if (xfs_sb_version_haslazysbcount(&mp->m_sb))
@@ -803,8 +807,8 @@ xfs_log_item_batch_insert(
{
int i;
- spin_lock(&ailp->xa_lock);
- /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+ spin_lock(&ailp->ail_lock);
+ /* xfs_trans_ail_update_bulk drops ailp->ail_lock */
xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
for (i = 0; i < nr_items; i++) {
@@ -847,9 +851,9 @@ xfs_trans_committed_bulk(
struct xfs_ail_cursor cur;
int i = 0;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
/* unpin all the log items */
for (lv = log_vector; lv; lv = lv->lv_next ) {
@@ -869,7 +873,7 @@ xfs_trans_committed_bulk(
* object into the AIL as we are in a shutdown situation.
*/
if (aborted) {
- ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+ ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount));
lip->li_ops->iop_unpin(lip, 1);
continue;
}
@@ -883,11 +887,11 @@ xfs_trans_committed_bulk(
* not affect the AIL cursor the bulk insert path is
* using.
*/
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
xfs_trans_ail_update(ailp, lip, item_lsn);
else
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
lip->li_ops->iop_unpin(lip, 0);
continue;
}
@@ -905,9 +909,9 @@ xfs_trans_committed_bulk(
if (i)
xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
}
/*
@@ -966,7 +970,7 @@ __xfs_trans_commit(
* log out now and wait for it.
*/
if (sync) {
- error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
+ error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
XFS_STATS_INC(mp, xs_trans_sync);
} else {
XFS_STATS_INC(mp, xs_trans_async);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index cef89f7127d3..d4a2445215e6 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -40,7 +40,7 @@ xfs_ail_check(
{
xfs_log_item_t *prev_lip;
- if (list_empty(&ailp->xa_ail))
+ if (list_empty(&ailp->ail_head))
return;
/*
@@ -48,11 +48,11 @@ xfs_ail_check(
*/
ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
- if (&prev_lip->li_ail != &ailp->xa_ail)
+ if (&prev_lip->li_ail != &ailp->ail_head)
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
- if (&prev_lip->li_ail != &ailp->xa_ail)
+ if (&prev_lip->li_ail != &ailp->ail_head)
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
@@ -69,10 +69,10 @@ static xfs_log_item_t *
xfs_ail_max(
struct xfs_ail *ailp)
{
- if (list_empty(&ailp->xa_ail))
+ if (list_empty(&ailp->ail_head))
return NULL;
- return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
+ return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail);
}
/*
@@ -84,7 +84,7 @@ xfs_ail_next(
struct xfs_ail *ailp,
xfs_log_item_t *lip)
{
- if (lip->li_ail.next == &ailp->xa_ail)
+ if (lip->li_ail.next == &ailp->ail_head)
return NULL;
return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
@@ -105,11 +105,11 @@ xfs_ail_min_lsn(
xfs_lsn_t lsn = 0;
xfs_log_item_t *lip;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_ail_min(ailp);
if (lip)
lsn = lip->li_lsn;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return lsn;
}
@@ -124,11 +124,11 @@ xfs_ail_max_lsn(
xfs_lsn_t lsn = 0;
xfs_log_item_t *lip;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
lip = xfs_ail_max(ailp);
if (lip)
lsn = lip->li_lsn;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
return lsn;
}
@@ -146,7 +146,7 @@ xfs_trans_ail_cursor_init(
struct xfs_ail_cursor *cur)
{
cur->item = NULL;
- list_add_tail(&cur->list, &ailp->xa_cursors);
+ list_add_tail(&cur->list, &ailp->ail_cursors);
}
/*
@@ -194,7 +194,7 @@ xfs_trans_ail_cursor_clear(
{
struct xfs_ail_cursor *cur;
- list_for_each_entry(cur, &ailp->xa_cursors, list) {
+ list_for_each_entry(cur, &ailp->ail_cursors, list) {
if (cur->item == lip)
cur->item = (struct xfs_log_item *)
((uintptr_t)cur->item | 1);
@@ -222,7 +222,7 @@ xfs_trans_ail_cursor_first(
goto out;
}
- list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+ list_for_each_entry(lip, &ailp->ail_head, li_ail) {
if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
goto out;
}
@@ -241,7 +241,7 @@ __xfs_trans_ail_cursor_last(
{
xfs_log_item_t *lip;
- list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) {
+ list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) {
if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
return lip;
}
@@ -310,7 +310,7 @@ xfs_ail_splice(
if (lip)
list_splice(list, &lip->li_ail);
else
- list_splice(list, &ailp->xa_ail);
+ list_splice(list, &ailp->ail_head);
}
/*
@@ -335,17 +335,17 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
- return lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
+ return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
}
static long
xfsaild_push(
struct xfs_ail *ailp)
{
- xfs_mount_t *mp = ailp->xa_mount;
+ xfs_mount_t *mp = ailp->ail_mount;
struct xfs_ail_cursor cur;
xfs_log_item_t *lip;
xfs_lsn_t lsn;
@@ -360,30 +360,30 @@ xfsaild_push(
* buffers the last time we ran, force the log first and wait for it
* before pushing again.
*/
- if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
- (!list_empty_careful(&ailp->xa_buf_list) ||
+ if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 &&
+ (!list_empty_careful(&ailp->ail_buf_list) ||
xfs_ail_min_lsn(ailp))) {
- ailp->xa_log_flush = 0;
+ ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
}
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
- /* barrier matches the xa_target update in xfs_ail_push() */
+ /* barrier matches the ail_target update in xfs_ail_push() */
smp_rmb();
- target = ailp->xa_target;
- ailp->xa_target_prev = target;
+ target = ailp->ail_target;
+ ailp->ail_target_prev = target;
- lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
if (!lip) {
/*
* If the AIL is empty or our push has reached the end we are
* done now.
*/
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
goto out_done;
}
@@ -404,7 +404,7 @@ xfsaild_push(
XFS_STATS_INC(mp, xs_push_ail_success);
trace_xfs_ail_push(lip);
- ailp->xa_last_pushed_lsn = lsn;
+ ailp->ail_last_pushed_lsn = lsn;
break;
case XFS_ITEM_FLUSHING:
@@ -423,7 +423,7 @@ xfsaild_push(
trace_xfs_ail_flushing(lip);
flushing++;
- ailp->xa_last_pushed_lsn = lsn;
+ ailp->ail_last_pushed_lsn = lsn;
break;
case XFS_ITEM_PINNED:
@@ -431,7 +431,7 @@ xfsaild_push(
trace_xfs_ail_pinned(lip);
stuck++;
- ailp->xa_log_flush++;
+ ailp->ail_log_flush++;
break;
case XFS_ITEM_LOCKED:
XFS_STATS_INC(mp, xs_push_ail_locked);
@@ -468,10 +468,10 @@ xfsaild_push(
lsn = lip->li_lsn;
}
xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
- if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
- ailp->xa_log_flush++;
+ if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
+ ailp->ail_log_flush++;
if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
out_done:
@@ -481,7 +481,7 @@ out_done:
* AIL before we start the next scan from the start of the AIL.
*/
tout = 50;
- ailp->xa_last_pushed_lsn = 0;
+ ailp->ail_last_pushed_lsn = 0;
} else if (((stuck + flushing) * 100) / count > 90) {
/*
* Either there is a lot of contention on the AIL or we are
@@ -494,7 +494,7 @@ out_done:
* the restart to issue a log force to unpin the stuck items.
*/
tout = 20;
- ailp->xa_last_pushed_lsn = 0;
+ ailp->ail_last_pushed_lsn = 0;
} else {
/*
* Assume we have more work to do in a short while.
@@ -536,26 +536,26 @@ xfsaild(
break;
}
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
/*
* Idle if the AIL is empty and we are not racing with a target
* update. We check the AIL after we set the task to a sleep
- * state to guarantee that we either catch an xa_target update
+ * state to guarantee that we either catch an ail_target update
* or that a wake_up resets the state to TASK_RUNNING.
* Otherwise, we run the risk of sleeping indefinitely.
*
- * The barrier matches the xa_target update in xfs_ail_push().
+ * The barrier matches the ail_target update in xfs_ail_push().
*/
smp_rmb();
if (!xfs_ail_min(ailp) &&
- ailp->xa_target == ailp->xa_target_prev) {
- spin_unlock(&ailp->xa_lock);
+ ailp->ail_target == ailp->ail_target_prev) {
+ spin_unlock(&ailp->ail_lock);
freezable_schedule();
tout = 0;
continue;
}
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
if (tout)
freezable_schedule_timeout(msecs_to_jiffies(tout));
@@ -592,8 +592,8 @@ xfs_ail_push(
xfs_log_item_t *lip;
lip = xfs_ail_min(ailp);
- if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
- XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
+ if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) ||
+ XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
return;
/*
@@ -601,10 +601,10 @@ xfs_ail_push(
* the XFS_AIL_PUSHING_BIT.
*/
smp_wmb();
- xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
+ xfs_trans_ail_copy_lsn(ailp, &ailp->ail_target, &threshold_lsn);
smp_wmb();
- wake_up_process(ailp->xa_task);
+ wake_up_process(ailp->ail_task);
}
/*
@@ -630,18 +630,18 @@ xfs_ail_push_all_sync(
struct xfs_log_item *lip;
DEFINE_WAIT(wait);
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
while ((lip = xfs_ail_max(ailp)) != NULL) {
- prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE);
- ailp->xa_target = lip->li_lsn;
- wake_up_process(ailp->xa_task);
- spin_unlock(&ailp->xa_lock);
+ prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
+ ailp->ail_target = lip->li_lsn;
+ wake_up_process(ailp->ail_task);
+ spin_unlock(&ailp->ail_lock);
schedule();
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
}
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
- finish_wait(&ailp->xa_empty, &wait);
+ finish_wait(&ailp->ail_empty, &wait);
}
/*
@@ -672,7 +672,7 @@ xfs_trans_ail_update_bulk(
struct xfs_ail_cursor *cur,
struct xfs_log_item **log_items,
int nr_items,
- xfs_lsn_t lsn) __releases(ailp->xa_lock)
+ xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
xfs_log_item_t *mlip;
int mlip_changed = 0;
@@ -705,13 +705,13 @@ xfs_trans_ail_update_bulk(
xfs_ail_splice(ailp, cur, &tmp, lsn);
if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- xlog_assign_tail_lsn_locked(ailp->xa_mount);
- spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
+ xlog_assign_tail_lsn_locked(ailp->ail_mount);
+ spin_unlock(&ailp->ail_lock);
- xfs_log_space_wake(ailp->xa_mount);
+ xfs_log_space_wake(ailp->ail_mount);
} else {
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
}
}
@@ -756,13 +756,13 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->xa_lock)
+ int shutdown_type) __releases(ailp->ail_lock)
{
- struct xfs_mount *mp = ailp->xa_mount;
+ struct xfs_mount *mp = ailp->ail_mount;
bool mlip_changed;
if (!(lip->li_flags & XFS_LI_IN_AIL)) {
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
@@ -776,13 +776,13 @@ xfs_trans_ail_delete(
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(mp))
xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->xa_ail))
- wake_up_all(&ailp->xa_empty);
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
}
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
if (mlip_changed)
- xfs_log_space_wake(ailp->xa_mount);
+ xfs_log_space_wake(ailp->ail_mount);
}
int
@@ -795,16 +795,16 @@ xfs_trans_ail_init(
if (!ailp)
return -ENOMEM;
- ailp->xa_mount = mp;
- INIT_LIST_HEAD(&ailp->xa_ail);
- INIT_LIST_HEAD(&ailp->xa_cursors);
- spin_lock_init(&ailp->xa_lock);
- INIT_LIST_HEAD(&ailp->xa_buf_list);
- init_waitqueue_head(&ailp->xa_empty);
+ ailp->ail_mount = mp;
+ INIT_LIST_HEAD(&ailp->ail_head);
+ INIT_LIST_HEAD(&ailp->ail_cursors);
+ spin_lock_init(&ailp->ail_lock);
+ INIT_LIST_HEAD(&ailp->ail_buf_list);
+ init_waitqueue_head(&ailp->ail_empty);
- ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->xa_mount->m_fsname);
- if (IS_ERR(ailp->xa_task))
+ ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+ ailp->ail_mount->m_fsname);
+ if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
mp->m_ail = ailp;
@@ -821,6 +821,6 @@ xfs_trans_ail_destroy(
{
struct xfs_ail *ailp = mp->m_ail;
- kthread_stop(ailp->xa_task);
+ kthread_stop(ailp->ail_task);
kmem_free(ailp);
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 653ce379d36b..a5d9dfc45d98 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -431,8 +431,8 @@ xfs_trans_brelse(
* If the fs has shutdown and we dropped the last reference, it may fall
* on us to release a (possibly dirty) bli if it never made it to the
* AIL (e.g., the aborted unpin already happened and didn't release it
- * due to our reference). Since we're already shutdown and need xa_lock,
- * just force remove from the AIL and release the bli here.
+ * due to our reference). Since we're already shutdown and need
+ * ail_lock, just force remove from the AIL and release the bli here.
*/
if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 4a89da4b6fe7..07cea592dc01 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -98,10 +98,24 @@ xfs_trans_log_inode(
xfs_inode_t *ip,
uint flags)
{
+ struct inode *inode = VFS_I(ip);
+
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
+ * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
+ * don't matter - we either will need an extra transaction in 24 hours
+ * to log the timestamps, or will clear already cleared fields in the
+ * worst case.
+ */
+ if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
* Record the specific change for fdatasync optimisation. This
* allows fdatasync to skip log forces for inodes that are only
* timestamp dirty. We do this before the change count so that
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index b317a3644c00..be24b0c8a332 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -65,17 +65,17 @@ struct xfs_ail_cursor {
* Eventually we need to drive the locking in here as well.
*/
struct xfs_ail {
- struct xfs_mount *xa_mount;
- struct task_struct *xa_task;
- struct list_head xa_ail;
- xfs_lsn_t xa_target;
- xfs_lsn_t xa_target_prev;
- struct list_head xa_cursors;
- spinlock_t xa_lock;
- xfs_lsn_t xa_last_pushed_lsn;
- int xa_log_flush;
- struct list_head xa_buf_list;
- wait_queue_head_t xa_empty;
+ struct xfs_mount *ail_mount;
+ struct task_struct *ail_task;
+ struct list_head ail_head;
+ xfs_lsn_t ail_target;
+ xfs_lsn_t ail_target_prev;
+ struct list_head ail_cursors;
+ spinlock_t ail_lock;
+ xfs_lsn_t ail_last_pushed_lsn;
+ int ail_log_flush;
+ struct list_head ail_buf_list;
+ wait_queue_head_t ail_empty;
};
/*
@@ -84,7 +84,7 @@ struct xfs_ail {
void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur,
struct xfs_log_item **log_items, int nr_items,
- xfs_lsn_t lsn) __releases(ailp->xa_lock);
+ xfs_lsn_t lsn) __releases(ailp->ail_lock);
/*
* Return a pointer to the first item in the AIL. If the AIL is empty, then
* return NULL.
@@ -93,7 +93,7 @@ static inline struct xfs_log_item *
xfs_ail_min(
struct xfs_ail *ailp)
{
- return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+ return list_first_entry_or_null(&ailp->ail_head, struct xfs_log_item,
li_ail);
}
@@ -101,14 +101,14 @@ static inline void
xfs_trans_ail_update(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
- xfs_lsn_t lsn) __releases(ailp->xa_lock)
+ xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->xa_lock);
+ int shutdown_type) __releases(ailp->ail_lock);
static inline void
xfs_trans_ail_remove(
@@ -117,12 +117,12 @@ xfs_trans_ail_remove(
{
struct xfs_ail *ailp = lip->li_ailp;
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
/* xfs_trans_ail_delete() drops the AIL lock */
if (lip->li_flags & XFS_LI_IN_AIL)
xfs_trans_ail_delete(ailp, lip, shutdown_type);
else
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
}
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
@@ -149,9 +149,9 @@ xfs_trans_ail_copy_lsn(
xfs_lsn_t *src)
{
ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
- spin_lock(&ailp->xa_lock);
+ spin_lock(&ailp->ail_lock);
*dst = *src;
- spin_unlock(&ailp->xa_lock);
+ spin_unlock(&ailp->ail_lock);
}
#else
static inline void
@@ -172,7 +172,7 @@ xfs_clear_li_failed(
struct xfs_buf *bp = lip->li_buf;
ASSERT(lip->li_flags & XFS_LI_IN_AIL);
- lockdep_assert_held(&lip->li_ailp->xa_lock);
+ lockdep_assert_held(&lip->li_ailp->ail_lock);
if (lip->li_flags & XFS_LI_FAILED) {
lip->li_flags &= ~XFS_LI_FAILED;
@@ -186,7 +186,7 @@ xfs_set_li_failed(
struct xfs_log_item *lip,
struct xfs_buf *bp)
{
- lockdep_assert_held(&lip->li_ailp->xa_lock);
+ lockdep_assert_held(&lip->li_ailp->ail_lock);
if (!(lip->li_flags & XFS_LI_FAILED)) {
xfs_buf_hold(bp);