Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 6
-rw-r--r--  fs/9p/xattr.c | 5
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Kconfig.binfmt | 8
-rw-r--r--  fs/affs/super.c | 5
-rw-r--r--  fs/afs/write.c | 4
-rw-r--r--  fs/aio.c | 7
-rw-r--r--  fs/bad_inode.c | 4
-rw-r--r--  fs/befs/datastream.c | 8
-rw-r--r--  fs/befs/io.c | 4
-rw-r--r--  fs/befs/linuxvfs.c | 8
-rw-r--r--  fs/binfmt_aout.c | 26
-rw-r--r--  fs/binfmt_elf.c | 18
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/binfmt_flat.c | 6
-rw-r--r--  fs/block_dev.c | 156
-rw-r--r--  fs/btrfs/backref.c | 4
-rw-r--r--  fs/btrfs/btrfs_inode.h | 12
-rw-r--r--  fs/btrfs/check-integrity.c | 4
-rw-r--r--  fs/btrfs/compression.c | 85
-rw-r--r--  fs/btrfs/ctree.c | 33
-rw-r--r--  fs/btrfs/ctree.h | 1129
-rw-r--r--  fs/btrfs/delayed-inode.c | 2
-rw-r--r--  fs/btrfs/delayed-ref.h | 2
-rw-r--r--  fs/btrfs/dev-replace.c | 103
-rw-r--r--  fs/btrfs/dev-replace.h | 4
-rw-r--r--  fs/btrfs/disk-io.c | 162
-rw-r--r--  fs/btrfs/disk-io.h | 2
-rw-r--r--  fs/btrfs/extent-tree.c | 233
-rw-r--r--  fs/btrfs/extent_io.c | 211
-rw-r--r--  fs/btrfs/extent_io.h | 39
-rw-r--r--  fs/btrfs/extent_map.c | 2
-rw-r--r--  fs/btrfs/file-item.c | 2
-rw-r--r--  fs/btrfs/file.c | 37
-rw-r--r--  fs/btrfs/free-space-cache.c | 20
-rw-r--r--  fs/btrfs/free-space-cache.h | 2
-rw-r--r--  fs/btrfs/hash.c | 5
-rw-r--r--  fs/btrfs/hash.h | 1
-rw-r--r--  fs/btrfs/inode-item.c | 2
-rw-r--r--  fs/btrfs/inode.c | 517
-rw-r--r--  fs/btrfs/ioctl.c | 228
-rw-r--r--  fs/btrfs/ordered-data.c | 30
-rw-r--r--  fs/btrfs/ordered-data.h | 8
-rw-r--r--  fs/btrfs/qgroup.c | 24
-rw-r--r--  fs/btrfs/raid56.c | 6
-rw-r--r--  fs/btrfs/reada.c | 2
-rw-r--r--  fs/btrfs/relocation.c | 32
-rw-r--r--  fs/btrfs/root-tree.c | 8
-rw-r--r--  fs/btrfs/scrub.c | 86
-rw-r--r--  fs/btrfs/send.c | 68
-rw-r--r--  fs/btrfs/struct-funcs.c | 2
-rw-r--r--  fs/btrfs/super.c | 129
-rw-r--r--  fs/btrfs/sysfs.c | 14
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 6
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 27
-rw-r--r--  fs/btrfs/tests/extent-buffer-tests.c | 13
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 92
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 83
-rw-r--r--  fs/btrfs/tests/free-space-tree-tests.c | 30
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 346
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 113
-rw-r--r--  fs/btrfs/transaction.c | 135
-rw-r--r--  fs/btrfs/transaction.h | 4
-rw-r--r--  fs/btrfs/tree-log.c | 88
-rw-r--r--  fs/btrfs/ulist.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 613
-rw-r--r--  fs/btrfs/volumes.h | 57
-rw-r--r--  fs/btrfs/xattr.c | 12
-rw-r--r--  fs/buffer.c | 10
-rw-r--r--  fs/cachefiles/interface.c | 2
-rw-r--r--  fs/ceph/addr.c | 220
-rw-r--r--  fs/ceph/cache.c | 143
-rw-r--r--  fs/ceph/cache.h | 44
-rw-r--r--  fs/ceph/caps.c | 74
-rw-r--r--  fs/ceph/debugfs.c | 2
-rw-r--r--  fs/ceph/dir.c | 376
-rw-r--r--  fs/ceph/file.c | 116
-rw-r--r--  fs/ceph/inode.c | 159
-rw-r--r--  fs/ceph/ioctl.c | 14
-rw-r--r--  fs/ceph/mds_client.c | 140
-rw-r--r--  fs/ceph/mds_client.h | 17
-rw-r--r--  fs/ceph/mdsmap.c | 43
-rw-r--r--  fs/ceph/super.c | 47
-rw-r--r--  fs/ceph/super.h | 16
-rw-r--r--  fs/ceph/xattr.c | 32
-rw-r--r--  fs/char_dev.c | 4
-rw-r--r--  fs/cifs/cifs_spnego.c | 67
-rw-r--r--  fs/cifs/cifsacl.c | 2
-rw-r--r--  fs/cifs/cifsfs.c | 11
-rw-r--r--  fs/cifs/cifsproto.h | 2
-rw-r--r--  fs/cifs/xattr.c | 9
-rw-r--r--  fs/compat.c | 4
-rw-r--r--  fs/coredump.c | 8
-rw-r--r--  fs/crypto/keyinfo.c | 120
-rw-r--r--  fs/dax.c | 842
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/debugfs/file.c | 437
-rw-r--r--  fs/debugfs/inode.c | 108
-rw-r--r--  fs/debugfs/internal.h | 26
-rw-r--r--  fs/devpts/inode.c | 191
-rw-r--r--  fs/direct-io.c | 14
-rw-r--r--  fs/ecryptfs/crypto.c | 9
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 4
-rw-r--r--  fs/ecryptfs/inode.c | 7
-rw-r--r--  fs/ecryptfs/kthread.c | 13
-rw-r--r--  fs/ecryptfs/mmap.c | 3
-rw-r--r--  fs/efivarfs/inode.c | 40
-rw-r--r--  fs/efs/super.c | 4
-rw-r--r--  fs/eventpoll.c | 12
-rw-r--r--  fs/exec.c | 46
-rw-r--r--  fs/ext2/file.c | 4
-rw-r--r--  fs/ext2/inode.c | 12
-rw-r--r--  fs/ext2/super.c | 11
-rw-r--r--  fs/ext2/xattr_security.c | 7
-rw-r--r--  fs/ext2/xattr_trusted.c | 7
-rw-r--r--  fs/ext2/xattr_user.c | 9
-rw-r--r--  fs/ext4/balloc.c | 3
-rw-r--r--  fs/ext4/dir.c | 5
-rw-r--r--  fs/ext4/ext4.h | 20
-rw-r--r--  fs/ext4/ext4_jbd2.h | 15
-rw-r--r--  fs/ext4/extents.c | 20
-rw-r--r--  fs/ext4/extents_status.c | 2
-rw-r--r--  fs/ext4/file.c | 6
-rw-r--r--  fs/ext4/ialloc.c | 59
-rw-r--r--  fs/ext4/indirect.c | 127
-rw-r--r--  fs/ext4/inline.c | 2
-rw-r--r--  fs/ext4/inode.c | 323
-rw-r--r--  fs/ext4/ioctl.c | 4
-rw-r--r--  fs/ext4/mballoc.c | 12
-rw-r--r--  fs/ext4/mmp.c | 4
-rw-r--r--  fs/ext4/move_extent.c | 2
-rw-r--r--  fs/ext4/namei.c | 9
-rw-r--r--  fs/ext4/page-io.c | 2
-rw-r--r--  fs/ext4/resize.c | 2
-rw-r--r--  fs/ext4/super.c | 15
-rw-r--r--  fs/ext4/xattr_security.c | 7
-rw-r--r--  fs/ext4/xattr_trusted.c | 7
-rw-r--r--  fs/ext4/xattr_user.c | 9
-rw-r--r--  fs/f2fs/Kconfig | 8
-rw-r--r--  fs/f2fs/acl.c | 4
-rw-r--r--  fs/f2fs/checkpoint.c | 67
-rw-r--r--  fs/f2fs/data.c | 197
-rw-r--r--  fs/f2fs/debug.c | 25
-rw-r--r--  fs/f2fs/dir.c | 128
-rw-r--r--  fs/f2fs/extent_cache.c | 3
-rw-r--r--  fs/f2fs/f2fs.h | 197
-rw-r--r--  fs/f2fs/file.c | 311
-rw-r--r--  fs/f2fs/gc.c | 27
-rw-r--r--  fs/f2fs/inline.c | 111
-rw-r--r--  fs/f2fs/inode.c | 66
-rw-r--r--  fs/f2fs/node.c | 316
-rw-r--r--  fs/f2fs/recovery.c | 149
-rw-r--r--  fs/f2fs/segment.c | 8
-rw-r--r--  fs/f2fs/segment.h | 9
-rw-r--r--  fs/f2fs/super.c | 288
-rw-r--r--  fs/f2fs/xattr.c | 15
-rw-r--r--  fs/fs-writeback.c | 3
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fuse/dir.c | 6
-rw-r--r--  fs/gfs2/aops.c | 5
-rw-r--r--  fs/gfs2/dir.c | 15
-rw-r--r--  fs/gfs2/file.c | 31
-rw-r--r--  fs/gfs2/glock.c | 11
-rw-r--r--  fs/gfs2/glops.c | 7
-rw-r--r--  fs/gfs2/inode.c | 6
-rw-r--r--  fs/gfs2/meta_io.c | 7
-rw-r--r--  fs/gfs2/meta_io.h | 8
-rw-r--r--  fs/gfs2/rgrp.c | 16
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/gfs2/xattr.c | 6
-rw-r--r--  fs/hfs/attr.c | 6
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hfsplus/xattr.c | 12
-rw-r--r--  fs/hfsplus/xattr.h | 2
-rw-r--r--  fs/hfsplus/xattr_security.c | 7
-rw-r--r--  fs/hfsplus/xattr_trusted.c | 7
-rw-r--r--  fs/hfsplus/xattr_user.c | 7
-rw-r--r--  fs/hpfs/super.c | 42
-rw-r--r--  fs/jbd2/commit.c | 4
-rw-r--r--  fs/jbd2/journal.c | 3
-rw-r--r--  fs/jbd2/transaction.c | 22
-rw-r--r--  fs/jffs2/security.c | 7
-rw-r--r--  fs/jffs2/xattr_trusted.c | 7
-rw-r--r--  fs/jffs2/xattr_user.c | 7
-rw-r--r--  fs/jfs/xattr.c | 14
-rw-r--r--  fs/kernfs/dir.c | 8
-rw-r--r--  fs/kernfs/file.c | 51
-rw-r--r--  fs/kernfs/inode.c | 26
-rw-r--r--  fs/kernfs/kernfs-internal.h | 3
-rw-r--r--  fs/libfs.c | 5
-rw-r--r--  fs/namei.c | 275
-rw-r--r--  fs/namespace.c | 6
-rw-r--r--  fs/nfs/callback_proc.c | 9
-rw-r--r--  fs/nfs/callback_xdr.c | 17
-rw-r--r--  fs/nfs/delegation.c | 9
-rw-r--r--  fs/nfs/delegation.h | 2
-rw-r--r--  fs/nfs/direct.c | 17
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 6
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 200
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 17
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 119
-rw-r--r--  fs/nfs/internal.h | 1
-rw-r--r--  fs/nfs/nfs42.h | 1
-rw-r--r--  fs/nfs/nfs42proc.c | 107
-rw-r--r--  fs/nfs/nfs42xdr.c | 146
-rw-r--r--  fs/nfs/nfs4_fs.h | 12
-rw-r--r--  fs/nfs/nfs4file.c | 23
-rw-r--r--  fs/nfs/nfs4idmap.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 204
-rw-r--r--  fs/nfs/nfs4state.c | 18
-rw-r--r--  fs/nfs/nfs4trace.h | 10
-rw-r--r--  fs/nfs/nfs4xdr.c | 43
-rw-r--r--  fs/nfs/pagelist.c | 6
-rw-r--r--  fs/nfs/pnfs.c | 349
-rw-r--r--  fs/nfs/pnfs.h | 17
-rw-r--r--  fs/nfs/pnfs_nfs.c | 60
-rw-r--r--  fs/nfs/super.c | 9
-rw-r--r--  fs/nfs/write.c | 64
-rw-r--r--  fs/nfsd/blocklayout.c | 2
-rw-r--r--  fs/nfsd/nfs3xdr.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 18
-rw-r--r--  fs/nfsd/nfs4layouts.c | 2
-rw-r--r--  fs/nfsd/nfs4state.c | 75
-rw-r--r--  fs/nfsd/state.h | 7
-rw-r--r--  fs/nilfs2/alloc.c | 39
-rw-r--r--  fs/nilfs2/alloc.h | 11
-rw-r--r--  fs/nilfs2/bmap.c | 10
-rw-r--r--  fs/nilfs2/bmap.h | 22
-rw-r--r--  fs/nilfs2/btnode.c | 9
-rw-r--r--  fs/nilfs2/btnode.h | 8
-rw-r--r--  fs/nilfs2/btree.c | 21
-rw-r--r--  fs/nilfs2/btree.h | 6
-rw-r--r--  fs/nilfs2/cpfile.c | 22
-rw-r--r--  fs/nilfs2/cpfile.h | 10
-rw-r--r--  fs/nilfs2/dat.c | 8
-rw-r--r--  fs/nilfs2/dat.h | 8
-rw-r--r--  fs/nilfs2/dir.c | 59
-rw-r--r--  fs/nilfs2/direct.c | 17
-rw-r--r--  fs/nilfs2/direct.h | 6
-rw-r--r--  fs/nilfs2/export.h | 2
-rw-r--r--  fs/nilfs2/file.c | 7
-rw-r--r--  fs/nilfs2/gcinode.c | 9
-rw-r--r--  fs/nilfs2/ifile.c | 14
-rw-r--r--  fs/nilfs2/ifile.h | 9
-rw-r--r--  fs/nilfs2/inode.c | 105
-rw-r--r--  fs/nilfs2/ioctl.c | 17
-rw-r--r--  fs/nilfs2/mdt.c | 63
-rw-r--r--  fs/nilfs2/mdt.h | 21
-rw-r--r--  fs/nilfs2/namei.c | 12
-rw-r--r--  fs/nilfs2/nilfs.h | 33
-rw-r--r--  fs/nilfs2/page.c | 15
-rw-r--r--  fs/nilfs2/page.h | 10
-rw-r--r--  fs/nilfs2/recovery.c | 25
-rw-r--r--  fs/nilfs2/segbuf.c | 10
-rw-r--r--  fs/nilfs2/segbuf.h | 11
-rw-r--r--  fs/nilfs2/segment.c | 125
-rw-r--r--  fs/nilfs2/segment.h | 44
-rw-r--r--  fs/nilfs2/sufile.c | 14
-rw-r--r--  fs/nilfs2/sufile.h | 10
-rw-r--r--  fs/nilfs2/super.c | 21
-rw-r--r--  fs/nilfs2/sysfs.c | 10
-rw-r--r--  fs/nilfs2/sysfs.h | 6
-rw-r--r--  fs/nilfs2/the_nilfs.c | 16
-rw-r--r--  fs/nilfs2/the_nilfs.h | 27
-rw-r--r--  fs/notify/fsnotify.h | 7
-rw-r--r--  fs/notify/group.c | 17
-rw-r--r--  fs/notify/mark.c | 78
-rw-r--r--  fs/ocfs2/alloc.c | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 185
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 17
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 5
-rw-r--r--  fs/ocfs2/inode.c | 7
-rw-r--r--  fs/ocfs2/journal.h | 2
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 2
-rw-r--r--  fs/ocfs2/slot_map.c | 6
-rw-r--r--  fs/ocfs2/xattr.c | 23
-rw-r--r--  fs/orangefs/xattr.c | 10
-rw-r--r--  fs/overlayfs/copy_up.c | 26
-rw-r--r--  fs/overlayfs/dir.c | 78
-rw-r--r--  fs/overlayfs/inode.c | 31
-rw-r--r--  fs/overlayfs/overlayfs.h | 6
-rw-r--r--  fs/overlayfs/readdir.c | 18
-rw-r--r--  fs/overlayfs/super.c | 37
-rw-r--r--  fs/posix_acl.c | 6
-rw-r--r--  fs/proc/array.c | 20
-rw-r--r--  fs/proc/base.c | 42
-rw-r--r--  fs/proc/page.c | 2
-rw-r--r--  fs/proc/root.c | 7
-rw-r--r--  fs/proc/task_mmu.c | 11
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/ramfs/file-nommu.c | 8
-rw-r--r--  fs/readdir.c | 16
-rw-r--r--  fs/reiserfs/objectid.c | 2
-rw-r--r--  fs/reiserfs/xattr_security.c | 9
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 9
-rw-r--r--  fs/reiserfs/xattr_user.c | 9
-rw-r--r--  fs/select.c | 67
-rw-r--r--  fs/ubifs/debug.c | 2
-rw-r--r--  fs/ubifs/sb.c | 2
-rw-r--r--  fs/ubifs/xattr.c | 7
-rw-r--r--  fs/userfaultfd.c | 41
-rw-r--r--  fs/xattr.c | 13
-rw-r--r--  fs/xfs/kmem.c | 26
-rw-r--r--  fs/xfs/kmem.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 58
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 22
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 99
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 102
-rw-r--r--  fs/xfs/xfs_aops.c | 353
-rw-r--r--  fs/xfs/xfs_aops.h | 15
-rw-r--r--  fs/xfs/xfs_attr.h | 4
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 16
-rw-r--r--  fs/xfs/xfs_attr_list.c | 85
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 60
-rw-r--r--  fs/xfs/xfs_buf.c | 12
-rw-r--r--  fs/xfs/xfs_buf.h | 20
-rw-r--r--  fs/xfs/xfs_buf_item.c | 121
-rw-r--r--  fs/xfs/xfs_dquot.c | 9
-rw-r--r--  fs/xfs/xfs_file.c | 15
-rw-r--r--  fs/xfs/xfs_fsops.c | 14
-rw-r--r--  fs/xfs/xfs_icache.c | 290
-rw-r--r--  fs/xfs/xfs_inode.c | 167
-rw-r--r--  fs/xfs/xfs_inode.h | 5
-rw-r--r--  fs/xfs/xfs_inode_item.c | 6
-rw-r--r--  fs/xfs/xfs_ioctl.c | 31
-rw-r--r--  fs/xfs/xfs_iomap.c | 53
-rw-r--r--  fs/xfs/xfs_iops.c | 117
-rw-r--r--  fs/xfs/xfs_log.c | 62
-rw-r--r--  fs/xfs/xfs_log.h | 3
-rw-r--r--  fs/xfs/xfs_log_cil.c | 1
-rw-r--r--  fs/xfs/xfs_log_priv.h | 1
-rw-r--r--  fs/xfs/xfs_log_recover.c | 12
-rw-r--r--  fs/xfs/xfs_mount.c | 23
-rw-r--r--  fs/xfs/xfs_mount.h | 34
-rw-r--r--  fs/xfs/xfs_pnfs.c | 7
-rw-r--r--  fs/xfs/xfs_qm.c | 9
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 26
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 21
-rw-r--r--  fs/xfs/xfs_super.c | 77
-rw-r--r--  fs/xfs/xfs_symlink.c | 37
-rw-r--r--  fs/xfs/xfs_sysfs.c | 291
-rw-r--r--  fs/xfs/xfs_sysfs.h | 3
-rw-r--r--  fs/xfs/xfs_trace.h | 16
-rw-r--r--  fs/xfs/xfs_trans.c | 88
-rw-r--r--  fs/xfs/xfs_trans.h | 8
-rw-r--r--  fs/xfs/xfs_xattr.c | 26
350 files changed, 10235 insertions, 7399 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eb3589edf485..0576eaeb60b9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
}
static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
int retval;
struct posix_acl *acl;
struct v9fs_session_info *v9ses;
- struct inode *inode = d_inode(dentry);
v9ses = v9fs_dentry2v9ses(dentry);
/*
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 18c62bae9591..a6bd349bab23 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
}
static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
const char *full_name = xattr_full_name(handler, name);
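
The 9p hunks above (and several later ones, e.g. bad_inode.c and the per-filesystem xattr_*.c files) switch the xattr_handler ->set callback to a signature that receives both the dentry and the inode. A minimal sketch of a handler using the new prototype follows; only the prototype is taken from the diff, the handler name and body are hypothetical.

static int example_xattr_set(const struct xattr_handler *handler,
			     struct dentry *dentry, struct inode *inode,
			     const char *name, const void *value,
			     size_t size, int flags)
{
	/*
	 * The inode is now passed in directly instead of being derived
	 * via d_inode(dentry), as the removed line in v9fs_xattr_set_acl
	 * used to do.
	 */
	if (!inode)
		return -EIO;
	return 0;	/* hypothetical: accept the attribute unchanged */
}
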
diff --git a/fs/Kconfig b/fs/Kconfig
index 6725f59c18e6..b8fcb416be72 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -52,6 +52,7 @@ config FS_DAX_PMD
depends on FS_DAX
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
+ depends on BROKEN
endif # BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 2d0cbbd14cfc..72c03354c14b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,6 +1,7 @@
config BINFMT_ELF
bool "Kernel support for ELF binaries"
depends on MMU && (BROKEN || !FRV)
+ select ELFCORE
default y
---help---
ELF (Executable and Linkable Format) is a format for libraries and
@@ -26,6 +27,7 @@ config BINFMT_ELF
config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
+ select ELFCORE
config ARCH_BINFMT_ELF_STATE
bool
@@ -34,6 +36,7 @@ config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+ select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
segments of a binary to be located in memory independently of each
@@ -43,6 +46,11 @@ config BINFMT_ELF_FDPIC
It is also possible to run FDPIC ELF binaries on MMU linux also.
+config ELFCORE
+ bool
+ help
+ This option enables kernel/elfcore.o.
+
config CORE_DUMP_DEFAULT_ELF_HEADERS
bool "Write ELF core dumps with partial segments"
default y
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2a6713b6b9f4..d6384863192c 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
char *prefix = NULL;
new_opts = kstrdup(data, GFP_KERNEL);
- if (!new_opts)
+ if (data && !new_opts)
return -ENOMEM;
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
@@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
}
flush_delayed_work(&sbi->sb_work);
- replace_mount_options(sb, new_opts);
+ if (new_opts)
+ replace_mount_options(sb, new_opts);
sbi->s_flags = mount_flags;
sbi->s_mode = mode;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 65de439bdc4f..14d506efd1aa 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
return 0;
result = generic_file_write_iter(iocb, from);
- if (IS_ERR_VALUE(result)) {
- _leave(" = %zd", result);
- return result;
- }
_leave(" = %zd", result);
return result;
diff --git a/fs/aio.c b/fs/aio.c
index a6deaa78326d..fb8e45b88cd4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -496,7 +496,12 @@ static int aio_setup_ring(struct kioctx *ctx)
ctx->mmap_size = nr_pages * PAGE_SIZE;
pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem)) {
+ ctx->mmap_size = 0;
+ aio_free_ring(ctx);
+ return -EINTR;
+ }
+
ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
PROT_READ | PROT_WRITE,
MAP_SHARED, 0, &unused);
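
The aio_setup_ring() change above replaces an unconditional down_write(&mm->mmap_sem) with the killable variant and unwinds when the task has been fatally signalled. A sketch of the general pattern, assuming only that the caller can propagate -EINTR:

static int example_update_mm(struct mm_struct *mm)
{
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;	/* fatal signal pending: undo and bail out */
	/* ... modify the address space here ... */
	up_write(&mm->mmap_sem);
	return 0;
}
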
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 72e35b721608..3ba385eaa26e 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
return -EIO;
}
-static int bad_inode_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags)
{
return -EIO;
}
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index dde0b79f3948..af1bc19b7c85 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -48,7 +48,7 @@ struct buffer_head *
befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
befs_off_t pos, uint * off)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_block_run run;
befs_blocknr_t block; /* block coresponding to pos */
@@ -127,7 +127,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds,
{
befs_off_t bytes_read = 0; /* bytes readed */
u16 plen;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_debug(sb, "---> %s length: %llu", __func__, len);
while (bytes_read < len) {
@@ -429,7 +429,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
struct buffer_head *dbl_indir_block;
struct buffer_head *indir_block;
befs_block_run indir_run;
- befs_disk_inode_addr *iaddr_array = NULL;
+ befs_disk_inode_addr *iaddr_array;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_blocknr_t indir_start_blk =
@@ -488,7 +488,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
brelse(dbl_indir_block);
- iaddr_array = NULL;
/* Read indirect block */
which_block = indir_indx / befs_iaddrs_per_block(sb);
@@ -513,7 +512,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
brelse(indir_block);
- iaddr_array = NULL;
blockno_at_run_start = indir_start_blk;
blockno_at_run_start += diblklen * dblindir_indx;
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 7a5b4ec21c56..523c8af2d770 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -26,7 +26,7 @@
struct buffer_head *
befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_blocknr_t block = 0;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
@@ -63,7 +63,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
struct buffer_head *
befs_bread(struct super_block *sb, befs_blocknr_t block)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 71112aa07d84..7da05b159ade 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -155,7 +155,7 @@ befs_get_block(struct inode *inode, sector_t block,
static struct dentry *
befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = NULL;
+ struct inode *inode;
struct super_block *sb = dir->i_sb;
const befs_data_stream *ds = &BEFS_I(dir)->i_data.ds;
befs_off_t offset;
@@ -294,10 +294,10 @@ static void init_once(void *foo)
static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
{
- struct buffer_head *bh = NULL;
- befs_inode *raw_inode = NULL;
+ struct buffer_head *bh;
+ befs_inode *raw_inode;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
- struct befs_inode_info *befs_ino = NULL;
+ struct befs_inode_info *befs_ino;
struct inode *inode;
long ret = -EIO;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 4c556680fa74..ae1b5404fced 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -127,12 +127,8 @@ static int set_brk(unsigned long start, unsigned long end)
{
start = PAGE_ALIGN(start);
end = PAGE_ALIGN(end);
- if (end > start) {
- unsigned long addr;
- addr = vm_brk(start, end - start);
- if (BAD_ADDR(addr))
- return addr;
- }
+ if (end > start)
+ return vm_brk(start, end - start);
return 0;
}
@@ -275,7 +271,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
map_size = ex.a_text+ex.a_data;
#endif
error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error != (text_addr & PAGE_MASK))
+ if (error)
return error;
error = read_code(bprm->file, text_addr, pos,
@@ -297,7 +293,10 @@ static int load_aout_binary(struct linux_binprm * bprm)
}
if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ if (error)
+ return error;
+
read_code(bprm->file, N_TXTADDR(ex), fd_offset,
ex.a_text + ex.a_data);
goto beyond_if;
@@ -378,8 +377,10 @@ static int load_aout_library(struct file *file)
"N_TXTOFF is not page aligned. Please convert library: %pD\n",
file);
}
- vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-
+ retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+ if (retval)
+ goto out;
+
read_code(file, start_addr, N_TXTOFF(ex),
ex.a_text + ex.a_data);
retval = 0;
@@ -397,9 +398,8 @@ static int load_aout_library(struct file *file)
len = PAGE_ALIGN(ex.a_text + ex.a_data);
bss = ex.a_text + ex.a_data + ex.a_bss;
if (bss > len) {
- error = vm_brk(start_addr + len, bss - len);
- retval = error;
- if (error != start_addr + len)
+ retval = vm_brk(start_addr + len, bss - len);
+ if (retval)
goto out;
}
retval = 0;
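
The binfmt_aout hunks above (and the binfmt_elf/binfmt_flat ones that follow) rely on vm_brk() reporting success as 0 and failure as a negative errno rather than returning the mapped address. A condensed sketch of the resulting caller pattern, mirroring the rewritten set_brk():

static int example_set_brk(unsigned long start, unsigned long end)
{
	start = PAGE_ALIGN(start);
	end = PAGE_ALIGN(end);
	if (end <= start)
		return 0;
	/* 0 on success, negative errno on failure -- no more BAD_ADDR() */
	return vm_brk(start, end - start);
}
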
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 56224ffa94d2..a7a28110dc80 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end)
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
- unsigned long addr;
- addr = vm_brk(start, end - start);
- if (BAD_ADDR(addr))
- return addr;
+ int error = vm_brk(start, end - start);
+ if (error)
+ return error;
}
current->mm->start_brk = current->mm->brk = end;
return 0;
@@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
/* Map the last of the bss segment */
error = vm_brk(elf_bss, last_bss - elf_bss);
- if (BAD_ADDR(error))
+ if (error)
goto out;
}
@@ -1176,8 +1175,11 @@ static int load_elf_library(struct file *file)
len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
ELF_MIN_ALIGN - 1);
bss = eppnt->p_memsz + eppnt->p_vaddr;
- if (bss > len)
- vm_brk(len, bss - len);
+ if (bss > len) {
+ error = vm_brk(len, bss - len);
+ if (error)
+ goto out_free_ph;
+ }
error = 0;
out_free_ph:
@@ -2273,7 +2275,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Align to page */
- if (!dump_skip(cprm, dataoff - cprm->file->f_pos))
+ if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 71ade0e556b7..203589311bf8 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
}
- if (!dump_skip(cprm, dataoff - cprm->file->f_pos))
+ if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
if (!elf_fdpic_dump_segments(cprm))
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f723cd3a455c..caf9e39bb82b 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
"(%d != %d)", (unsigned) r, curid, id);
goto failed;
} else if ( ! p->lib_list[id].loaded &&
- IS_ERR_VALUE(load_flat_shared_library(id, p))) {
+ load_flat_shared_library(id, p) < 0) {
printk("BINFMT_FLAT: failed to load library %d", id);
goto failed;
}
@@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
res = prepare_binprm(&bprm);
- if (!IS_ERR_VALUE(res))
+ if (!res)
res = load_flat_file(&bprm, libs, id, NULL);
abort_creds(bprm.cred);
@@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm)
stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
res = load_flat_file(bprm, &libinfo, 0, &stack_len);
- if (IS_ERR_VALUE(res))
+ if (res < 0)
return res;
/* Update data segment pointers for all libraries */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a063d4d8ac39..71ccab1d22c6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/dax.h>
+#include <linux/badblocks.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -50,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+ va_end(args);
+}
+
static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = bdev->bd_inode;
@@ -488,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
if (!avail)
return -ERANGE;
if (avail > 0 && avail & ~PAGE_MASK)
@@ -497,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+ struct blk_dax_ctl dax = {
+ .sector = 0,
+ .size = PAGE_SIZE,
+ };
+ int err;
+
+ if (blocksize != PAGE_SIZE) {
+ vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+ return -EINVAL;
+ }
+
+ err = bdev_direct_access(sb->s_bdev, &dax);
+ if (err < 0) {
+ switch (err) {
+ case -EOPNOTSUPP:
+ vfs_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ break;
+ case -EINVAL:
+ vfs_msg(sb, KERN_ERR,
+ "error: unaligned partition for dax");
+ break;
+ default:
+ vfs_msg(sb, KERN_ERR,
+ "error: dax access failed (%d)", err);
+ }
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+ struct blk_dax_ctl dax = {
+ .size = PAGE_SIZE,
+ };
+
+ if (!IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+
+ dax.sector = 0;
+ if (bdev_direct_access(bdev, &dax) < 0)
+ return false;
+
+ dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+ if (bdev_direct_access(bdev, &dax) < 0)
+ return false;
+
+ return true;
+}
+
/*
* pseudo-fs
*/
@@ -1238,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- if (!blkdev_dax_capable(bdev))
+ if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
@@ -1275,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- if (!blkdev_dax_capable(bdev))
+ if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
@@ -1720,79 +1802,13 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
-#ifdef CONFIG_FS_DAX
-/*
- * In the raw block case we do not need to contend with truncation nor
- * unwritten file extents. Without those concerns there is no need for
- * additional locking beyond the mmap_sem context that these routines
- * are already executing under.
- *
- * Note, there is no protection if the block device is dynamically
- * resized (partition grow/shrink) during a fault. A stable block device
- * size is already not enforced in the blkdev_direct_IO path.
- *
- * For DAX, it is the responsibility of the block device driver to
- * ensure the whole-disk device size is stable while requests are in
- * flight.
- *
- * Finally, unlike the filemap_page_mkwrite() case there is no
- * filesystem superblock to sync against freezing. We still include a
- * pfn_mkwrite callback for dax drivers to receive write fault
- * notifications.
- */
-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- return __dax_fault(vma, vmf, blkdev_get_block, NULL);
-}
-
-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
- struct vm_fault *vmf)
-{
- return dax_pfn_mkwrite(vma, vmf);
-}
-
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static const struct vm_operations_struct blkdev_dax_vm_ops = {
- .fault = blkdev_dax_fault,
- .pmd_fault = blkdev_dax_pmd_fault,
- .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
-};
-
-static const struct vm_operations_struct blkdev_default_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
-};
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct inode *bd_inode = bdev_file_inode(file);
-
- file_accessed(file);
- if (IS_DAX(bd_inode)) {
- vma->vm_ops = &blkdev_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
- } else {
- vma->vm_ops = &blkdev_default_vm_ops;
- }
-
- return 0;
-}
-#else
-#define blkdev_mmap generic_file_mmap
-#endif
-
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = blkdev_mmap,
+ .mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
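
The bdev_dax_supported() helper introduced above is a library routine filesystems are expected to call when the dax mount option is requested. A hedged sketch of such a call site; the surrounding option handling (example_try_dax, dax_requested) is hypothetical:

static int example_try_dax(struct super_block *sb, bool dax_requested)
{
	int err;

	if (!dax_requested)
		return 0;
	err = bdev_dax_supported(sb, PAGE_SIZE);	/* blocksize must equal PAGE_SIZE */
	if (err)
		return err;	/* the helper already printed the specific reason */
	return 0;	/* caller may keep the dax option enabled */
}
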
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 80e8472d618b..8bb3509099e8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
* from ipath->fspath->val[i].
* when it returns, there are ipath->fspath->elem_cnt number of paths available
* in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
- * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
* it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
* have been needed to return all paths.
*/
@@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
if (!ifp) {
- kfree(fspath);
+ vfree(fspath);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 61205e3bbefa..4919aedb5fc1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
struct list_head delayed_iput;
long delayed_iput_count;
+ /*
+ * To avoid races between lockless (i_mutex not held) direct IO writes
+ * and concurrent fsync requests. Direct IO writes must acquire read
+ * access on this semaphore for creating an extent map and its
+ * corresponding ordered extent. The fast fsync path must acquire write
+ * access on this semaphore before it collects ordered extents and
+ * extent maps.
+ */
+ struct rw_semaphore dio_sem;
+
struct inode vfs_inode;
};
@@ -303,7 +313,7 @@ struct btrfs_dio_private {
struct bio *dio_bio;
/*
- * The original bio may be splited to several sub-bios, this is
+ * The original bio may be split to several sub-bios, this is
* done during endio of sub-bios
*/
int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 516e19d1d202..7706c8dc5fa6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1939,7 +1939,7 @@ again:
/*
* Clear all references of this block. Do not free
* the block itself even if is not referenced anymore
- * because it still carries valueable information
+ * because it still carries valuable information
* like whether it was ever written and IO completed.
*/
list_for_each_entry_safe(l, tmp, &block->ref_to_list,
@@ -2645,7 +2645,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
btrfsic_get_block_type(state, block),
block->logical_bytenr, block->dev_state->name,
block->dev_bytenr, block->mirror_num);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ff61a41ac90b..658c39b70fba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -743,8 +743,11 @@ out:
static struct {
struct list_head idle_ws;
spinlock_t ws_lock;
- int num_ws;
- atomic_t alloc_ws;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
wait_queue_head_t ws_wait;
} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
@@ -758,16 +761,34 @@ void __init btrfs_init_compress(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+ struct list_head *workspace;
+
INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+ atomic_set(&btrfs_comp_ws[i].total_ws, 0);
init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so
+ * we can guarantee forward progress in the worst case
+ */
+ workspace = btrfs_compress_op[i]->alloc_workspace();
+ if (IS_ERR(workspace)) {
+ printk(KERN_WARNING
+ "BTRFS: cannot preallocate compression workspace, will try later");
+ } else {
+ atomic_set(&btrfs_comp_ws[i].total_ws, 1);
+ btrfs_comp_ws[i].free_ws = 1;
+ list_add(workspace, &btrfs_comp_ws[i].idle_ws);
+ }
}
}
/*
- * this finds an available workspace or allocates a new one
- * ERR_PTR is returned if things go bad.
+ * This finds an available workspace or allocates a new one.
+ * If it's not possible to allocate a new one, waits until there's one.
+ * Preallocation makes a forward progress guarantees and we do not return
+ * errors.
*/
static struct list_head *find_workspace(int type)
{
@@ -777,36 +798,58 @@ static struct list_head *find_workspace(int type)
struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
- atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
- int *num_ws = &btrfs_comp_ws[idx].num_ws;
+ int *free_ws = &btrfs_comp_ws[idx].free_ws;
again:
spin_lock(ws_lock);
if (!list_empty(idle_ws)) {
workspace = idle_ws->next;
list_del(workspace);
- (*num_ws)--;
+ (*free_ws)--;
spin_unlock(ws_lock);
return workspace;
}
- if (atomic_read(alloc_ws) > cpus) {
+ if (atomic_read(total_ws) > cpus) {
DEFINE_WAIT(wait);
spin_unlock(ws_lock);
prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(alloc_ws) > cpus && !*num_ws)
+ if (atomic_read(total_ws) > cpus && !*free_ws)
schedule();
finish_wait(ws_wait, &wait);
goto again;
}
- atomic_inc(alloc_ws);
+ atomic_inc(total_ws);
spin_unlock(ws_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
- atomic_dec(alloc_ws);
+ atomic_dec(total_ws);
wake_up(ws_wait);
+
+ /*
+ * Do not return the error but go back to waiting. There's a
+ * workspace preallocated for each type and the compression
+ * time is bounded so we get to a workspace eventually. This
+ * makes our caller's life easier.
+ *
+ * To prevent silent and low-probability deadlocks (when the
+ * initial preallocation fails), check if there are any
+ * workspaces at all.
+ */
+ if (atomic_read(total_ws) == 0) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ /* once per minute */ 60 * HZ,
+ /* no burst */ 1);
+
+ if (__ratelimit(&_rs)) {
+ printk(KERN_WARNING
+ "no compression workspaces, low memory, retrying");
+ }
+ }
+ goto again;
}
return workspace;
}
@@ -820,21 +863,21 @@ static void free_workspace(int type, struct list_head *workspace)
int idx = type - 1;
struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
- atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
- int *num_ws = &btrfs_comp_ws[idx].num_ws;
+ int *free_ws = &btrfs_comp_ws[idx].free_ws;
spin_lock(ws_lock);
- if (*num_ws < num_online_cpus()) {
+ if (*free_ws < num_online_cpus()) {
list_add(workspace, idle_ws);
- (*num_ws)++;
+ (*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
- atomic_dec(alloc_ws);
+ atomic_dec(total_ws);
wake:
/*
* Make sure counter is updated before we wake up waiters.
@@ -857,7 +900,7 @@ static void free_workspaces(void)
workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].alloc_ws);
+ atomic_dec(&btrfs_comp_ws[i].total_ws);
}
}
}
@@ -894,8 +937,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
int ret;
workspace = find_workspace(type);
- if (IS_ERR(workspace))
- return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
@@ -930,8 +971,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
int ret;
workspace = find_workspace(type);
- if (IS_ERR(workspace))
- return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
@@ -952,8 +991,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int ret;
workspace = find_workspace(type);
- if (IS_ERR(workspace))
- return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
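
After the compression.c change, find_workspace() preallocates one workspace per compression type at init time and otherwise waits for a free one, so it no longer returns an ERR_PTR and the three callers above drop their IS_ERR() checks. A simplified sketch of the resulting caller shape; example_compress and do_compress are hypothetical stand-ins for the real btrfs entry points:

static int example_compress(int type, struct page **pages)
{
	struct list_head *workspace;
	int ret;

	workspace = find_workspace(type);	/* may sleep, never an error now */
	ret = do_compress(workspace, pages);	/* hypothetical worker */
	free_workspace(type, workspace);
	return ret;
}
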
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec7928a27aaa..6276add8538a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -156,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
/*
* RCU really hurts here, we could free up the root node because
- * it was cow'ed but we may not get the new root node yet so do
+ * it was COWed but we may not get the new root node yet so do
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
@@ -955,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf)
{
/*
- * Tree blocks not in refernece counted trees and tree roots
+ * Tree blocks not in reference counted trees and tree roots
* are never shared. If a block was allocated after the last
* snapshot and the block was not allocated by tree relocation,
* we know the block is not shared.
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
} else {
@@ -1270,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
/*
* tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewinded (until we reach something older than
+ * previous operations will be rewound (until we reach something older than
* time_seq).
*/
static void
@@ -1345,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
}
/*
- * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
* is returned. If rewind operations happen, a fresh buffer is returned. The
* returned buffer is always read-locked. If the returned buffer is not the
* input buffer, the lock on the input buffer is released and the input buffer
@@ -1373,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
+ eb->len);
if (!eb_rewin) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1454,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
} else if (old_root) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(root->fs_info, logical);
+ eb = alloc_dummy_extent_buffer(root->fs_info, logical,
+ root->nodesize);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
eb = btrfs_clone_extent_buffer(eb_root);
@@ -1516,7 +1518,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
* 3) the root is not forced COW.
*
* What is forced COW:
- * when we create snapshot during commiting the transaction,
+ * when we create snapshot during committing the transaction,
* after we've finished coping src root, we must COW the shared
* block to ensure the metadata consistency.
*/
@@ -1531,7 +1533,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
/*
* cows a single block, see __btrfs_cow_block for the real work.
- * This version of it has extra checks so that a block isn't cow'd more than
+ * This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
@@ -1552,6 +1554,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, root->fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
+ trans->dirty = true;
*cow_ret = buf;
return 0;
}
@@ -1928,7 +1931,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
child = read_node_slot(root, mid, 0);
if (!child) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -2031,7 +2034,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, root, mid, left);
@@ -2510,6 +2513,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
if (!btrfs_buffer_uptodate(tmp, 0, 0))
ret = -EIO;
free_extent_buffer(tmp);
+ } else {
+ ret = PTR_ERR(tmp);
}
return ret;
}
@@ -2773,8 +2778,10 @@ again:
* then we don't want to set the path blocking,
* so we test it here
*/
- if (!should_cow_block(trans, root, b))
+ if (!should_cow_block(trans, root, b)) {
+ trans->dirty = true;
goto cow_done;
+ }
/*
* must have write locks on this node and the
@@ -2986,7 +2993,7 @@ again:
btrfs_unlock_up_safe(p, level + 1);
/*
- * Since we can unwind eb's we want to do a real search every
+ * Since we can unwind ebs we want to do a real search every
* time.
*/
prev_cmp = -1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84a6a5b3384a..101c3cfd3f7c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
+#include <linux/btrfs_tree.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/sizes.h>
@@ -64,98 +65,6 @@ struct btrfs_ordered_sum;
#define BTRFS_COMPAT_EXTENT_TREE_V0
-/* holds pointers to all of the tree roots */
-#define BTRFS_ROOT_TREE_OBJECTID 1ULL
-
-/* stores information about which extents are in use, and reference counts */
-#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
-
-/*
- * chunk tree stores translations from logical -> physical block numbering
- * the super block points to the chunk tree
- */
-#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
-
-/*
- * stores information about which areas of a given device are in use.
- * one per device. The tree of tree roots points to the device tree
- */
-#define BTRFS_DEV_TREE_OBJECTID 4ULL
-
-/* one per subvolume, storing files and directories */
-#define BTRFS_FS_TREE_OBJECTID 5ULL
-
-/* directory objectid inside the root tree */
-#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
-
-/* holds checksums of all the data extents */
-#define BTRFS_CSUM_TREE_OBJECTID 7ULL
-
-/* holds quota configuration and tracking */
-#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
-
-/* for storing items that use the BTRFS_UUID_KEY* types */
-#define BTRFS_UUID_TREE_OBJECTID 9ULL
-
-/* tracks free space in block groups. */
-#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
-
-/* device stats in the device tree */
-#define BTRFS_DEV_STATS_OBJECTID 0ULL
-
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
-/* orhpan objectid for tracking unlinked/truncated files */
-#define BTRFS_ORPHAN_OBJECTID -5ULL
-
-/* does write ahead logging to speed up fsyncs */
-#define BTRFS_TREE_LOG_OBJECTID -6ULL
-#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
-
-/* for space balancing */
-#define BTRFS_TREE_RELOC_OBJECTID -8ULL
-#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
-
-/*
- * extent checksums all have this objectid
- * this allows them to share the logging tree
- * for fsyncs
- */
-#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
-
-/* For storing free space cache */
-#define BTRFS_FREE_SPACE_OBJECTID -11ULL
-
-/*
- * The inode number assigned to the special inode for storing
- * free ino cache
- */
-#define BTRFS_FREE_INO_OBJECTID -12ULL
-
-/* dummy objectid represents multiple objectids */
-#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
-
-/*
- * All files have objectids in this range.
- */
-#define BTRFS_FIRST_FREE_OBJECTID 256ULL
-#define BTRFS_LAST_FREE_OBJECTID -256ULL
-#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
-
-
-/*
- * the device items go into the chunk tree. The key is in the form
- * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
- */
-#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
-
-#define BTRFS_BTREE_INODE_OBJECTID 1
-
-#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
-
-#define BTRFS_DEV_REPLACE_DEVID 0ULL
-
/*
* the max metadata block size. This limit is somewhat artificial,
* but the memmove costs go through the roof for larger blocks.
@@ -175,31 +84,14 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_LINK_MAX 65535U
-/* 32 bytes in various csum fields */
-#define BTRFS_CSUM_SIZE 32
-
-/* csum types */
-#define BTRFS_CSUM_TYPE_CRC32 0
-
static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
-/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
#define REQ_GET_READ_MIRRORS (1 << 30)
-#define BTRFS_FT_UNKNOWN 0
-#define BTRFS_FT_REG_FILE 1
-#define BTRFS_FT_DIR 2
-#define BTRFS_FT_CHRDEV 3
-#define BTRFS_FT_BLKDEV 4
-#define BTRFS_FT_FIFO 5
-#define BTRFS_FT_SOCK 6
-#define BTRFS_FT_SYMLINK 7
-#define BTRFS_FT_XATTR 8
-#define BTRFS_FT_MAX 9
-
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
@@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 };
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
-/*
- * The key defines the order in the tree, and so it also defines (optimal)
- * block layout.
- *
- * objectid corresponds to the inode number.
- *
- * type tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with type of 1 might refer to the inode data,
- * type of 2 may point to file data in the btree and type == 3 may point to
- * extents.
- *
- * offset is the starting byte offset for this key in the stream.
- *
- * btrfs_disk_key is in disk byte order. struct btrfs_key is always
- * in cpu native order. Otherwise they are identical and their sizes
- * should be the same (ie both packed)
- */
-struct btrfs_disk_key {
- __le64 objectid;
- u8 type;
- __le64 offset;
-} __attribute__ ((__packed__));
-
-struct btrfs_key {
- u64 objectid;
- u8 type;
- u64 offset;
-} __attribute__ ((__packed__));
-
struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
};
-struct btrfs_dev_item {
- /* the internal btrfs device id */
- __le64 devid;
-
- /* size of the device */
- __le64 total_bytes;
-
- /* bytes used */
- __le64 bytes_used;
-
- /* optimal io alignment for this device */
- __le32 io_align;
-
- /* optimal io width for this device */
- __le32 io_width;
-
- /* minimal io size for this device */
- __le32 sector_size;
-
- /* type and info about this device */
- __le64 type;
-
- /* expected generation for this device */
- __le64 generation;
-
- /*
- * starting byte of this partition on the device,
- * to allow for stripe alignment in the future
- */
- __le64 start_offset;
-
- /* grouping information for allocation decisions */
- __le32 dev_group;
-
- /* seek speed 0-100 where 100 is fastest */
- u8 seek_speed;
-
- /* bandwidth 0-100 where 100 is fastest */
- u8 bandwidth;
-
- /* btrfs generated uuid for this device */
- u8 uuid[BTRFS_UUID_SIZE];
-
- /* uuid of FS who owns this device */
- u8 fsid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_stripe {
- __le64 devid;
- __le64 offset;
- u8 dev_uuid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_chunk {
- /* size of this chunk in bytes */
- __le64 length;
-
- /* objectid of the root referencing this chunk */
- __le64 owner;
-
- __le64 stripe_len;
- __le64 type;
-
- /* optimal io alignment for this chunk */
- __le32 io_align;
-
- /* optimal io width for this chunk */
- __le32 io_width;
-
- /* minimal io size for this chunk */
- __le32 sector_size;
-
- /* 2^16 stripes is quite a lot, a second limit is the size of a single
- * item in the btree
- */
- __le16 num_stripes;
-
- /* sub stripes only matter for raid10 */
- __le16 sub_stripes;
- struct btrfs_stripe stripe;
- /* additional stripes go here */
-} __attribute__ ((__packed__));
-
-#define BTRFS_FREE_SPACE_EXTENT 1
-#define BTRFS_FREE_SPACE_BITMAP 2
-
-struct btrfs_free_space_entry {
- __le64 offset;
- __le64 bytes;
- u8 type;
-} __attribute__ ((__packed__));
-
-struct btrfs_free_space_header {
- struct btrfs_disk_key location;
- __le64 generation;
- __le64 num_entries;
- __le64 num_bitmaps;
-} __attribute__ ((__packed__));
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
sizeof(struct btrfs_stripe) * (num_stripes - 1);
}
-#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
-#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
-
/*
* File system states
*/
@@ -357,13 +118,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_TRANS_ABORTED 2
#define BTRFS_FS_STATE_DEV_REPLACING 3
-/* Super block flags */
-/* Errors detected */
-#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
-
-#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
-#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
-
#define BTRFS_BACKREF_REV_MAX 256
#define BTRFS_BACKREF_REV_SHIFT 56
#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
@@ -410,7 +164,6 @@ struct btrfs_header {
* room to translate 14 chunks with 3 stripes each.
*/
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
-#define BTRFS_LABEL_SIZE 256
/*
* just in case we somehow lose the roots and are not able to mount,
@@ -507,31 +260,6 @@ struct btrfs_super_block {
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
-#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
-
-#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
-#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
-/*
- * some patches floated around with a second compression method
- * lets save that incompat here for when they do get in
- * Note we don't actually support it, we're just reserving the
- * number
- */
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
-
-/*
- * older kernels tried to do bigger metadata blocks, but the
- * code was pretty buggy. Lets not let them try anymore.
- */
-#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
-
-#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
-#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
-#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
-#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
-
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
@@ -624,357 +352,8 @@ struct btrfs_path {
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
};
-
-/*
- * items in the extent btree are used to record the objectid of the
- * owner of the block and the number of references
- */
-
-struct btrfs_extent_item {
- __le64 refs;
- __le64 generation;
- __le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_item_v0 {
- __le32 refs;
-} __attribute__ ((__packed__));
-
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
sizeof(struct btrfs_item))
-
-#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
-#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
-
-/* following flags only apply to tree blocks */
-
-/* use full backrefs for extent pointers in the block */
-#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
-
-/*
- * this flag is only used internally by scrub and may be changed at any time
- * it is only declared here to avoid collisions
- */
-#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
-
-struct btrfs_tree_block_info {
- struct btrfs_disk_key key;
- u8 level;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_data_ref {
- __le64 root;
- __le64 objectid;
- __le64 offset;
- __le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_shared_data_ref {
- __le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_inline_ref {
- u8 type;
- __le64 offset;
-} __attribute__ ((__packed__));
-
-/* old style backrefs item */
-struct btrfs_extent_ref_v0 {
- __le64 root;
- __le64 generation;
- __le64 objectid;
- __le32 count;
-} __attribute__ ((__packed__));
-
-
-/* dev extents record free space on individual devices. The owner
- * field points back to the chunk allocation mapping tree that allocated
- * the extent. The chunk tree uuid field is a way to double check the owner
- */
-struct btrfs_dev_extent {
- __le64 chunk_tree;
- __le64 chunk_objectid;
- __le64 chunk_offset;
- __le64 length;
- u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_ref {
- __le64 index;
- __le16 name_len;
- /* name goes here */
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_extref {
- __le64 parent_objectid;
- __le64 index;
- __le16 name_len;
- __u8 name[0];
- /* name goes here */
-} __attribute__ ((__packed__));
-
-struct btrfs_timespec {
- __le64 sec;
- __le32 nsec;
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_item {
- /* nfs style generation number */
- __le64 generation;
- /* transid that last touched this inode */
- __le64 transid;
- __le64 size;
- __le64 nbytes;
- __le64 block_group;
- __le32 nlink;
- __le32 uid;
- __le32 gid;
- __le32 mode;
- __le64 rdev;
- __le64 flags;
-
- /* modification sequence number for NFS */
- __le64 sequence;
-
- /*
- * a little future expansion, for more than this we can
- * just grow the inode item and version it
- */
- __le64 reserved[4];
- struct btrfs_timespec atime;
- struct btrfs_timespec ctime;
- struct btrfs_timespec mtime;
- struct btrfs_timespec otime;
-} __attribute__ ((__packed__));
-
-struct btrfs_dir_log_item {
- __le64 end;
-} __attribute__ ((__packed__));
-
-struct btrfs_dir_item {
- struct btrfs_disk_key location;
- __le64 transid;
- __le16 data_len;
- __le16 name_len;
- u8 type;
-} __attribute__ ((__packed__));
-
-#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
-
-/*
- * Internal in-memory flag that a subvolume has been marked for deletion but
- * still visible as a directory
- */
-#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
-
-struct btrfs_root_item {
- struct btrfs_inode_item inode;
- __le64 generation;
- __le64 root_dirid;
- __le64 bytenr;
- __le64 byte_limit;
- __le64 bytes_used;
- __le64 last_snapshot;
- __le64 flags;
- __le32 refs;
- struct btrfs_disk_key drop_progress;
- u8 drop_level;
- u8 level;
-
- /*
- * The following fields appear after subvol_uuids+subvol_times
- * were introduced.
- */
-
- /*
- * This generation number is used to test if the new fields are valid
- * and up to date while reading the root item. Every time the root item
- * is written out, the "generation" field is copied into this field. If
- * anyone ever mounted the fs with an older kernel, we will have
- * mismatching generation values here and thus must invalidate the
- * new fields. See btrfs_update_root and btrfs_find_last_root for
- * details.
- * the offset of generation_v2 is also used as the start for the memset
- * when invalidating the fields.
- */
- __le64 generation_v2;
- u8 uuid[BTRFS_UUID_SIZE];
- u8 parent_uuid[BTRFS_UUID_SIZE];
- u8 received_uuid[BTRFS_UUID_SIZE];
- __le64 ctransid; /* updated when an inode changes */
- __le64 otransid; /* trans when created */
- __le64 stransid; /* trans when sent. non-zero for received subvol */
- __le64 rtransid; /* trans when received. non-zero for received subvol */
- struct btrfs_timespec ctime;
- struct btrfs_timespec otime;
- struct btrfs_timespec stime;
- struct btrfs_timespec rtime;
- __le64 reserved[8]; /* for future */
-} __attribute__ ((__packed__));
-
-/*
- * this is used for both forward and backward root refs
- */
-struct btrfs_root_ref {
- __le64 dirid;
- __le64 sequence;
- __le16 name_len;
-} __attribute__ ((__packed__));
-
-struct btrfs_disk_balance_args {
- /*
- * profiles to operate on, single is denoted by
- * BTRFS_AVAIL_ALLOC_BIT_SINGLE
- */
- __le64 profiles;
-
- /*
- * usage filter
- * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
- * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
- */
- union {
- __le64 usage;
- struct {
- __le32 usage_min;
- __le32 usage_max;
- };
- };
-
- /* devid filter */
- __le64 devid;
-
- /* devid subset filter [pstart..pend) */
- __le64 pstart;
- __le64 pend;
-
- /* btrfs virtual address space subset filter [vstart..vend) */
- __le64 vstart;
- __le64 vend;
-
- /*
- * profile to convert to, single is denoted by
- * BTRFS_AVAIL_ALLOC_BIT_SINGLE
- */
- __le64 target;
-
- /* BTRFS_BALANCE_ARGS_* */
- __le64 flags;
-
- /*
- * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
- * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
- * and maximum
- */
- union {
- __le64 limit;
- struct {
- __le32 limit_min;
- __le32 limit_max;
- };
- };
-
- /*
- * Process chunks that cross stripes_min..stripes_max devices,
- * BTRFS_BALANCE_ARGS_STRIPES_RANGE
- */
- __le32 stripes_min;
- __le32 stripes_max;
-
- __le64 unused[6];
-} __attribute__ ((__packed__));
-
-/*
- * store balance parameters to disk so that balance can be properly
- * resumed after crash or unmount
- */
-struct btrfs_balance_item {
- /* BTRFS_BALANCE_* */
- __le64 flags;
-
- struct btrfs_disk_balance_args data;
- struct btrfs_disk_balance_args meta;
- struct btrfs_disk_balance_args sys;
-
- __le64 unused[4];
-} __attribute__ ((__packed__));
-
-#define BTRFS_FILE_EXTENT_INLINE 0
-#define BTRFS_FILE_EXTENT_REG 1
-#define BTRFS_FILE_EXTENT_PREALLOC 2
-
-struct btrfs_file_extent_item {
- /*
- * transaction id that created this extent
- */
- __le64 generation;
- /*
- * max number of bytes to hold this extent in ram
- * when we split a compressed extent we can't know how big
- * each of the resulting pieces will be. So, this is
- * an upper limit on the size of the extent in ram instead of
- * an exact limit.
- */
- __le64 ram_bytes;
-
- /*
- * 32 bits for the various ways we might encode the data,
- * including compression and encryption. If any of these
- * are set to something a given disk format doesn't understand
- * it is treated like an incompat flag for reading and writing,
- * but not for stat.
- */
- u8 compression;
- u8 encryption;
- __le16 other_encoding; /* spare for later use */
-
- /* are we inline data or a real extent? */
- u8 type;
-
- /*
- * disk space consumed by the extent, checksum blocks are included
- * in these numbers
- *
- * At this offset in the structure, the inline extent data start.
- */
- __le64 disk_bytenr;
- __le64 disk_num_bytes;
- /*
- * the logical offset in file blocks (no csums)
- * this extent record is for. This allows a file extent to point
- * into the middle of an existing extent on disk, sharing it
- * between two snapshots (useful if some bytes in the middle of the
- * extent have changed
- */
- __le64 offset;
- /*
- * the logical number of file blocks (no csums included). This
- * always reflects the size uncompressed and without encoding.
- */
- __le64 num_bytes;
-
-} __attribute__ ((__packed__));
-
-struct btrfs_csum_item {
- u8 csum;
-} __attribute__ ((__packed__));
-
-struct btrfs_dev_stats_item {
- /*
- * grow this item struct at the end for future enhancements and keep
- * the existing values unchanged
- */
- __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
-} __attribute__ ((__packed__));
-
-#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
-#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
-#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
-
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
u64 time_started; /* seconds since 1-Jan-1970 */
@@ -1005,175 +384,6 @@ struct btrfs_dev_replace {
struct btrfs_scrub_progress scrub_progress;
};
-struct btrfs_dev_replace_item {
- /*
- * grow this item struct at the end for future enhancements and keep
- * the existing values unchanged
- */
- __le64 src_devid;
- __le64 cursor_left;
- __le64 cursor_right;
- __le64 cont_reading_from_srcdev_mode;
-
- __le64 replace_state;
- __le64 time_started;
- __le64 time_stopped;
- __le64 num_write_errors;
- __le64 num_uncorrectable_read_errors;
-} __attribute__ ((__packed__));
-
-/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
-#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
-#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
- BTRFS_SPACE_INFO_GLOBAL_RSV)
-
-enum btrfs_raid_types {
- BTRFS_RAID_RAID10,
- BTRFS_RAID_RAID1,
- BTRFS_RAID_DUP,
- BTRFS_RAID_RAID0,
- BTRFS_RAID_SINGLE,
- BTRFS_RAID_RAID5,
- BTRFS_RAID_RAID6,
- BTRFS_NR_RAID_TYPES
-};
-
-#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
- BTRFS_BLOCK_GROUP_SYSTEM | \
- BTRFS_BLOCK_GROUP_METADATA)
-
-#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
- BTRFS_BLOCK_GROUP_RAID1 | \
- BTRFS_BLOCK_GROUP_RAID5 | \
- BTRFS_BLOCK_GROUP_RAID6 | \
- BTRFS_BLOCK_GROUP_DUP | \
- BTRFS_BLOCK_GROUP_RAID10)
-#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
- BTRFS_BLOCK_GROUP_RAID6)
-
-/*
- * We need a bit for restriper to be able to tell when chunks of type
- * SINGLE are available. This "extended" profile format is used in
- * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
- * (on-disk). The corresponding on-disk bit in chunk.type is reserved
- * to avoid remappings between two formats in future.
- */
-#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
-
-/*
- * A fake block group type that is used to communicate global block reserve
- * size to userspace via the SPACE_INFO ioctl.
- */
-#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
-
-#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)
-
-static inline u64 chunk_to_extended(u64 flags)
-{
- if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
- flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-
- return flags;
-}
-static inline u64 extended_to_chunk(u64 flags)
-{
- return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-}
-
-struct btrfs_block_group_item {
- __le64 used;
- __le64 chunk_objectid;
- __le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_free_space_info {
- __le32 extent_count;
- __le32 flags;
-} __attribute__ ((__packed__));
-
-#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
-
-#define BTRFS_QGROUP_LEVEL_SHIFT 48
-static inline u64 btrfs_qgroup_level(u64 qgroupid)
-{
- return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
-}
-
-/*
- * is subvolume quota turned on?
- */
-#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
-/*
- * RESCAN is set during the initialization phase
- */
-#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
-/*
- * Some qgroup entries are known to be out of date,
- * either because the configuration has changed in a way that
- * makes a rescan necessary, or because the fs has been mounted
- * with a non-qgroup-aware version.
- * Turning qouta off and on again makes it inconsistent, too.
- */
-#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
-
-#define BTRFS_QGROUP_STATUS_VERSION 1
-
-struct btrfs_qgroup_status_item {
- __le64 version;
- /*
- * the generation is updated during every commit. As older
- * versions of btrfs are not aware of qgroups, it will be
- * possible to detect inconsistencies by checking the
- * generation on mount time
- */
- __le64 generation;
-
- /* flag definitions see above */
- __le64 flags;
-
- /*
- * only used during scanning to record the progress
- * of the scan. It contains a logical address
- */
- __le64 rescan;
-} __attribute__ ((__packed__));
-
-struct btrfs_qgroup_info_item {
- __le64 generation;
- __le64 rfer;
- __le64 rfer_cmpr;
- __le64 excl;
- __le64 excl_cmpr;
-} __attribute__ ((__packed__));
-
-/* flags definition for qgroup limits */
-#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
-#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
-#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
-#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
-#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
-#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
-
-struct btrfs_qgroup_limit_item {
- /*
- * only updated when any of the other values change
- */
- __le64 flags;
- __le64 max_rfer;
- __le64 max_excl;
- __le64 rsv_rfer;
- __le64 rsv_excl;
-} __attribute__ ((__packed__));
-
/* For raid type sysfs entries */
struct raid_kobject {
int raid_type;
@@ -1221,7 +431,7 @@ struct btrfs_space_info {
* bytes_pinned does not reflect the bytes that will be pinned once the
* delayed refs are flushed, so this counter is inc'ed every time we
* call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zero'ed every
+ * freed once the transaction is committed. It will be zeroed every
* time the transaction commits.
*/
struct percpu_counter total_bytes_pinned;
@@ -1408,6 +618,27 @@ struct btrfs_block_group_cache {
struct btrfs_io_ctl io_ctl;
+ /*
+ * Incremented when doing extent allocations and holding a read lock
+ * on the space_info's groups_sem semaphore.
+ * Decremented when an ordered extent that represents an IO against this
+ * block group's range is created (after it's added to its inode's
+ * root's list of ordered extents) or immediately after the allocation
+ * if it's a metadata extent or fallocate extent (for these cases we
+ * don't create ordered extents).
+ */
+ atomic_t reservations;
+
+ /*
+ * Incremented while holding the spinlock *lock* by a task checking if
+ * it can perform a nocow write (incremented if the value for the *ro*
+ * field is 0). Decremented by such tasks once they create an ordered
+ * extent or before that if some error happens before reaching that step.
+ * This is to prevent races between block group relocation and nocow
+ * writes through direct IO.
+ */
+ atomic_t nocow_writers;
+
/* Lock for free space tree operations. */
struct mutex free_space_lock;
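The two counters added in the hunk above are paired with helpers introduced later in this patch (prototypes further down in this header, definitions in extent-tree.c). Roughly, and with the surrounding steps below being illustrative rather than code from the patch, the reservations counter is driven like this:

	/* allocation path: called while space_info->groups_sem is held for reading */
	atomic_inc(&bg->reservations);

	/* IO path: once the ordered extent for the allocated range exists,
	 * start being the logical address of the allocated extent */
	btrfs_dec_block_group_reservations(fs_info, start);

Relocation can then mark the block group read-only and use the wait helpers added later in the patch to let such in-flight reservations drain.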
@@ -2026,228 +1257,6 @@ struct btrfs_root {
atomic_t qgroup_meta_rsv;
};
-struct btrfs_ioctl_defrag_range_args {
- /* start of the defrag operation */
- __u64 start;
-
- /* number of bytes to defrag, use (u64)-1 to say all */
- __u64 len;
-
- /*
- * flags for the operation, which can include turning
- * on compression for this one defrag
- */
- __u64 flags;
-
- /*
- * any extent bigger than this will be considered
- * already defragged. Use 0 to take the kernel default
- * Use 1 to say every single extent must be rewritten
- */
- __u32 extent_thresh;
-
- /*
- * which compression method to use if turning on compression
- * for this defrag operation. If unspecified, zlib will
- * be used
- */
- __u32 compress_type;
-
- /* spare for later */
- __u32 unused[4];
-};
-
-
-/*
- * inode items have the data typically returned from stat and store other
- * info about object characteristics. There is one for every file and dir in
- * the FS
- */
-#define BTRFS_INODE_ITEM_KEY 1
-#define BTRFS_INODE_REF_KEY 12
-#define BTRFS_INODE_EXTREF_KEY 13
-#define BTRFS_XATTR_ITEM_KEY 24
-#define BTRFS_ORPHAN_ITEM_KEY 48
-/* reserve 2-15 close to the inode for later flexibility */
-
-/*
- * dir items are the name -> inode pointers in a directory. There is one
- * for every name in a directory.
- */
-#define BTRFS_DIR_LOG_ITEM_KEY 60
-#define BTRFS_DIR_LOG_INDEX_KEY 72
-#define BTRFS_DIR_ITEM_KEY 84
-#define BTRFS_DIR_INDEX_KEY 96
-/*
- * extent data is for file data
- */
-#define BTRFS_EXTENT_DATA_KEY 108
-
-/*
- * extent csums are stored in a separate tree and hold csums for
- * an entire extent on disk.
- */
-#define BTRFS_EXTENT_CSUM_KEY 128
-
-/*
- * root items point to tree roots. They are typically in the root
- * tree used by the super block to find all the other trees
- */
-#define BTRFS_ROOT_ITEM_KEY 132
-
-/*
- * root backrefs tie subvols and snapshots to the directory entries that
- * reference them
- */
-#define BTRFS_ROOT_BACKREF_KEY 144
-
-/*
- * root refs make a fast index for listing all of the snapshots and
- * subvolumes referenced by a given root. They point directly to the
- * directory item in the root that references the subvol
- */
-#define BTRFS_ROOT_REF_KEY 156
-
-/*
- * extent items are in the extent map tree. These record which blocks
- * are used, and how many references there are to each block
- */
-#define BTRFS_EXTENT_ITEM_KEY 168
-
-/*
- * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
- * the length, so we save the level in key->offset instead of the length.
- */
-#define BTRFS_METADATA_ITEM_KEY 169
-
-#define BTRFS_TREE_BLOCK_REF_KEY 176
-
-#define BTRFS_EXTENT_DATA_REF_KEY 178
-
-#define BTRFS_EXTENT_REF_V0_KEY 180
-
-#define BTRFS_SHARED_BLOCK_REF_KEY 182
-
-#define BTRFS_SHARED_DATA_REF_KEY 184
-
-/*
- * block groups give us hints into the extent allocation trees. Which
- * blocks are free etc etc
- */
-#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
-
-/*
- * Every block group is represented in the free space tree by a free space info
- * item, which stores some accounting information. It is keyed on
- * (block_group_start, FREE_SPACE_INFO, block_group_length).
- */
-#define BTRFS_FREE_SPACE_INFO_KEY 198
-
-/*
- * A free space extent tracks an extent of space that is free in a block group.
- * It is keyed on (start, FREE_SPACE_EXTENT, length).
- */
-#define BTRFS_FREE_SPACE_EXTENT_KEY 199
-
-/*
- * When a block group becomes very fragmented, we convert it to use bitmaps
- * instead of extents. A free space bitmap is keyed on
- * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
- * (length / sectorsize) bits.
- */
-#define BTRFS_FREE_SPACE_BITMAP_KEY 200
-
-#define BTRFS_DEV_EXTENT_KEY 204
-#define BTRFS_DEV_ITEM_KEY 216
-#define BTRFS_CHUNK_ITEM_KEY 228
-
-/*
- * Records the overall state of the qgroups.
- * There's only one instance of this key present,
- * (0, BTRFS_QGROUP_STATUS_KEY, 0)
- */
-#define BTRFS_QGROUP_STATUS_KEY 240
-/*
- * Records the currently used space of the qgroup.
- * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
- */
-#define BTRFS_QGROUP_INFO_KEY 242
-/*
- * Contains the user configured limits for the qgroup.
- * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
- */
-#define BTRFS_QGROUP_LIMIT_KEY 244
-/*
- * Records the child-parent relationship of qgroups. For
- * each relation, 2 keys are present:
- * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
- * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
- */
-#define BTRFS_QGROUP_RELATION_KEY 246
-
-/*
- * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
- */
-#define BTRFS_BALANCE_ITEM_KEY 248
-
-/*
- * The key type for tree items that are stored persistently, but do not need to
- * exist for extended period of time. The items can exist in any tree.
- *
- * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
- *
- * Existing items:
- *
- * - balance status item
- * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
- */
-#define BTRFS_TEMPORARY_ITEM_KEY 248
-
-/*
- * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
- */
-#define BTRFS_DEV_STATS_KEY 249
-
-/*
- * The key type for tree items that are stored persistently and usually exist
- * for a long period, eg. filesystem lifetime. The item kinds can be status
- * information, stats or preference values. The item can exist in any tree.
- *
- * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
- *
- * Existing items:
- *
- * - device statistics, store IO stats in the device tree, one key for all
- * stats
- * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
- */
-#define BTRFS_PERSISTENT_ITEM_KEY 249
-
-/*
- * Persistantly stores the device replace state in the device tree.
- * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
- */
-#define BTRFS_DEV_REPLACE_KEY 250
-
-/*
- * Stores items that allow to quickly map UUIDs to something else.
- * These items are part of the filesystem UUID tree.
- * The key is built like this:
- * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
- */
-#if BTRFS_UUID_SIZE != 16
-#error "UUID items require BTRFS_UUID_SIZE == 16!"
-#endif
-#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */
-#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to
- * received subvols */
-
-/*
- * string items are for debugging. They just store a short string of
- * data in the FS
- */
-#define BTRFS_STRING_ITEM_KEY 253
-
/*
* Flags for mount options.
*
@@ -2392,7 +1401,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
token->kaddr = NULL;
}
-/* some macros to generate set/get funcs for the struct fields. This
+/* some macros to generate set/get functions for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
* one for u8:
*/
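For reference, one typical invocation of these macros elsewhere in this header (not part of the hunk above) and the rough shape of the accessors it generates:

	BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);

	/* expands, roughly, to a pair of accessors like: */
	u64 btrfs_inode_generation(struct extent_buffer *eb,
				   struct btrfs_inode_item *s);
	void btrfs_set_inode_generation(struct extent_buffer *eb,
					struct btrfs_inode_item *s, u64 val);

The generated bodies read and write the field through the extent buffer pages with the matching le64_to_cpu/cpu_to_le64 conversion.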
@@ -3499,6 +2508,12 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@@ -4122,6 +3137,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
@@ -4326,10 +3342,9 @@ static inline void assfail(char *expr, char *file, int line)
#define ASSERT(expr) ((void)0)
#endif
-#define btrfs_assert()
__printf(5, 6)
__cold
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
const char *btrfs_decode_error(int errno);
@@ -4339,6 +3354,46 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+#define btrfs_abort_transaction(trans, root, errno) \
+do { \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((root)->fs_info->fs_state))) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
+ __btrfs_abort_transaction((trans), (root), __func__, \
+ __LINE__, (errno)); \
+} while (0)
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+__printf(5, 6)
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
+} while (0)
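+
+A rough sketch of how the macros above are meant to be used; the operations producing ret are placeholders, not code from this patch:
+
+	ret = some_tree_operation(trans, root);		/* hypothetical */
+	if (ret) {
+		/* inside a transaction: record the exact line and abort */
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+
+	ret = some_mount_time_operation(fs_info);	/* hypothetical */
+	if (ret)
+		/* outside a transaction: was btrfs_std_error() before this patch */
+		btrfs_handle_fs_error(fs_info, ret, "operation failed: %d", ret);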
+
+
+/* compatibility and incompatibility defines */
+
#define btrfs_set_fs_incompat(__fs_info, opt) \
__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
@@ -4455,44 +3510,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
}
-/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
- */
-#define btrfs_abort_transaction(trans, root, errno) \
-do { \
- /* Report first abort since mount */ \
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
- &((root)->fs_info->fs_state))) { \
- WARN(1, KERN_DEBUG \
- "BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
- } \
- __btrfs_abort_transaction((trans), (root), __func__, \
- __LINE__, (errno)); \
-} while (0)
-
-#define btrfs_std_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_std_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
-} while (0)
-
-__printf(5, 6)
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
-
-/*
- * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
- * will panic(). Otherwise we BUG() here.
- */
-#define btrfs_panic(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
- BUG(); \
-} while (0)
-
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6cef0062f929..61561c2a3f96 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -134,7 +134,7 @@ again:
/* cached in the btrfs inode and can be accessed */
atomic_add(2, &node->refs);
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = radix_tree_preload(GFP_NOFS);
if (ret) {
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(ret);
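GFP_NOFS never includes __GFP_HIGHMEM, so the mask dropped above was redundant. The preload/insert pattern this function relies on is, roughly (field names as used elsewhere in this file):

	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ERR_PTR(ret);

	spin_lock(&root->inode_lock);
	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
	spin_unlock(&root->inode_lock);
	radix_tree_preload_end();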
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c24b653c7343..5fca9534a271 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root {
/*
* To make qgroup to skip given root.
- * This is for snapshot, as btrfs_qgroup_inherit() will manully
+ * This is for snapshot, as btrfs_qgroup_inherit() will manually
* modify counters for snapshot and its source, so we should skip
* the snapshot in new_root/old_roots or it will get calculated twice
*/
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 26bcb487f958..63ef9cdf0144 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
-static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
- char *srcdev_name,
- struct btrfs_device **device);
static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
dev_replace->cursor_left_last_write_of_item;
}
-int btrfs_dev_replace_start(struct btrfs_root *root,
- struct btrfs_ioctl_dev_replace_args *args)
+int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+ u64 srcdevid, char *srcdev_name, int read_src)
{
struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
- switch (args->start.cont_reading_from_srcdev_mode) {
- case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
- case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
- break;
- default:
- return -EINVAL;
- }
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
-
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
- args->start.srcdev_name,
- &src_device);
+ ret = btrfs_find_device_by_devspec(root, srcdevid,
+ srcdev_name, &src_device);
if (ret) {
mutex_unlock(&fs_info->volume_mutex);
return ret;
}
- ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name,
src_device, &tgt_device);
mutex_unlock(&fs_info->volume_mutex);
if (ret)
@@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
- dev_replace->cont_reading_from_srcdev_mode =
- args->start.cont_reading_from_srcdev_mode;
+ dev_replace->cont_reading_from_srcdev_mode = read_src;
WARN_ON(!src_device);
dev_replace->srcdev = src_device;
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
- btrfs_info_in_rcu(root->fs_info,
+ btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s started",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
@@ -396,14 +379,13 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
- args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace, 1);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
- btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+ btrfs_err(fs_info, "kobj add dev failed %d\n", ret);
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -421,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
- ret = btrfs_dev_replace_finishing(root->fs_info, ret);
- /* don't warn if EINPROGRESS, someone else might be running scrub */
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
- args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- ret = 0;
+ ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
} else {
WARN_ON(ret);
}
@@ -440,8 +420,37 @@ leave:
return ret;
}
+int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ int ret;
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ ret = btrfs_dev_replace_start(root, args->start.tgtdev_name,
+ args->start.srcdevid,
+ args->start.srcdev_name,
+ args->start.cont_reading_from_srcdev_mode);
+ args->result = ret;
+ /* don't warn if EINPROGRESS, someone else might be running scrub */
+ if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
+ ret = 0;
+
+ return ret;
+}
+
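With the split above, the ioctl handler goes through btrfs_dev_replace_by_ioctl() while in-kernel callers can start a replace directly. A hedged sketch of such a direct call (device names are made up):

	ret = btrfs_dev_replace_start(root, "/dev/sdc" /* target */,
			0 /* srcdevid; 0 means look up by name */,
			"/dev/sdb" /* source */,
			BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID);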
/*
- * blocked until all flighting bios are finished.
+ * blocked until all in-flight bio operations are finished.
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
@@ -495,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -560,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ASSERT(list_empty(&src_device->resized_list));
tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;
- if (fs_info->sb->s_bdev == src_device->bdev)
- fs_info->sb->s_bdev = tgt_device->bdev;
- if (fs_info->fs_devices->latest_bdev == src_device->bdev)
- fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+
+ btrfs_assign_next_active_device(fs_info, src_device, tgt_device);
+
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
@@ -626,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
write_unlock(&em_tree->lock);
}
-static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
- char *srcdev_name,
- struct btrfs_device **device)
-{
- int ret;
-
- if (srcdevid) {
- ret = 0;
- *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
- NULL);
- if (!*device)
- ret = -ENOENT;
- } else {
- ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
- device);
- }
- return ret;
-}
-
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 29e3ef5f96bd..e922b42d91df 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_start(struct btrfs_root *root,
+int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+ u64 srcdevid, char *srcdev_name, int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4e47849d7427..54cca7a1572b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -384,7 +384,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
/*
* Things reading via commit roots that don't have normal protection,
* like send, can have a really old block in cache that may point at a
- * block that has been free'd and re-allocated. So don't clear uptodate
+ * block that has been freed and re-allocated. So don't clear uptodate
* if we find an eb that is under IO (dirty/writeback) because we could
* end up reading in the stale data and then writing it back out and
* making everybody very sad.
@@ -418,7 +418,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
- * is filled with zeros and is included in the checkum.
+ * is filled with zeros and is included in the checksum.
*/
crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
@@ -600,7 +600,7 @@ static noinline int check_leaf(struct btrfs_root *root,
/*
* Check to make sure that we don't point outside of the leaf,
- * just incase all the items are consistent to eachother, but
+ * just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
if (btrfs_item_end_nr(leaf, slot) >
@@ -1098,7 +1098,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
struct inode *btree_inode = root->fs_info->btree_inode;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
+ if (IS_ERR(buf))
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, WAIT_NONE, btree_get_extent, 0);
@@ -1114,7 +1114,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int ret;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
+ if (IS_ERR(buf))
return 0;
set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
@@ -1147,7 +1147,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr)
{
if (btrfs_test_is_dummy_root(root))
- return alloc_test_extent_buffer(root->fs_info, bytenr);
+ return alloc_test_extent_buffer(root->fs_info, bytenr,
+ root->nodesize);
return alloc_extent_buffer(root->fs_info, bytenr);
}
@@ -1171,8 +1172,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
int ret;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(buf))
+ return buf;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
@@ -1314,14 +1315,16 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
-struct btrfs_root *btrfs_alloc_dummy_root(void)
+struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize)
{
struct btrfs_root *root;
root = btrfs_alloc_root(NULL, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(4096, 4096, 4096, root, NULL, 1);
+ /* We don't use the stripesize in selftests, so set it to the sectorsize */
+ __setup_root(nodesize, sectorsize, sectorsize, root, NULL,
+ BTRFS_ROOT_TREE_OBJECTID);
set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
root->alloc_bytenr = 0;
@@ -1640,7 +1643,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = radix_tree_preload(GFP_NOFS);
if (ret)
return ret;
@@ -1803,6 +1806,13 @@ static int cleaner_kthread(void *arg)
if (btrfs_need_cleaner_sleep(root))
goto sleep;
+ /*
+ * Do not do anything if we might cause open_ctree() to block
+ * before we have finished mounting the filesystem.
+ */
+ if (!root->fs_info->open)
+ goto sleep;
+
if (!mutex_trylock(&root->fs_info->cleaner_mutex))
goto sleep;
@@ -2417,7 +2427,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_std_error(tree_root->fs_info, ret,
+ btrfs_handle_fs_error(tree_root->fs_info, ret,
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2713,7 +2723,7 @@ int open_ctree(struct super_block *sb,
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
if (btrfs_check_super_csum(bh->b_data)) {
- printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
+ btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
brelse(bh);
goto fail_alloc;
@@ -2733,7 +2743,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
if (ret) {
- printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
+ btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
goto fail_alloc;
}
@@ -2768,9 +2778,9 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_incompat_flags(disk_super) &
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- printk(KERN_ERR "BTRFS: couldn't mount because of "
- "unsupported optional features (%Lx).\n",
- features);
+ btrfs_err(fs_info,
+ "cannot mount because of unsupported optional features (%llx)",
+ features);
err = -EINVAL;
goto fail_alloc;
}
@@ -2781,7 +2791,7 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_INFO "BTRFS: has skinny extents\n");
+ btrfs_info(fs_info, "has skinny extents");
/*
* flag our filesystem as having big metadata blocks if
@@ -2789,7 +2799,8 @@ int open_ctree(struct super_block *sb,
*/
if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
@@ -2805,9 +2816,9 @@ int open_ctree(struct super_block *sb,
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != nodesize)) {
- printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
- "are not allowed for mixed block groups on %s\n",
- sb->s_id);
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ nodesize, sectorsize);
goto fail_alloc;
}
@@ -2820,8 +2831,8 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
- "unsupported option features (%Lx).\n",
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -2850,8 +2861,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to read the system "
- "array on %s\n", sb->s_id);
+ btrfs_err(fs_info, "failed to read the system array: %d", ret);
goto fail_sb_buffer;
}
@@ -2865,8 +2875,7 @@ int open_ctree(struct super_block *sb,
generation);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
- sb->s_id);
+ btrfs_err(fs_info, "failed to read chunk root");
if (!IS_ERR(chunk_root->node))
free_extent_buffer(chunk_root->node);
chunk_root->node = NULL;
@@ -2880,8 +2889,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_chunk_tree(chunk_root);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
- sb->s_id);
+ btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
goto fail_tree_roots;
}
@@ -2892,8 +2900,7 @@ int open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_devices, 0);
if (!fs_devices->latest_bdev) {
- printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
- sb->s_id);
+ btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -2905,8 +2912,7 @@ retry_root_backup:
generation);
if (IS_ERR(tree_root->node) ||
!extent_buffer_uptodate(tree_root->node)) {
- printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
- sb->s_id);
+ btrfs_warn(fs_info, "failed to read tree root");
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
tree_root->node = NULL;
@@ -2938,20 +2944,19 @@ retry_root_backup:
ret = btrfs_recover_balance(fs_info);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to recover balance\n");
+ btrfs_err(fs_info, "failed to recover balance: %d", ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_stats(fs_info);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
- ret);
+ btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_replace(fs_info);
if (ret) {
- pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
+ btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
goto fail_block_groups;
}
@@ -2959,31 +2964,33 @@ retry_root_backup:
ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
if (ret) {
- pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+ btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
+ ret);
goto fail_block_groups;
}
ret = btrfs_sysfs_add_device(fs_devices);
if (ret) {
- pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+ btrfs_err(fs_info, "failed to init sysfs device interface: %d",
+ ret);
goto fail_fsdev_sysfs;
}
ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
- pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+ btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
if (ret) {
- printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+ btrfs_err(fs_info, "failed to initialize space info: %d", ret);
goto fail_sysfs;
}
ret = btrfs_read_block_groups(fs_info->extent_root);
if (ret) {
- printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+ btrfs_err(fs_info, "failed to read block groups: %d", ret);
goto fail_sysfs;
}
fs_info->num_tolerated_disk_barrier_failures =
@@ -2991,7 +2998,8 @@ retry_root_backup:
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(sb->s_flags & MS_RDONLY)) {
- pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+ btrfs_warn(fs_info,
+"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed",
fs_info->fs_devices->missing_devices,
fs_info->num_tolerated_disk_barrier_failures);
goto fail_sysfs;
@@ -3011,13 +3019,12 @@ retry_root_backup:
if (!btrfs_test_opt(tree_root, SSD) &&
!btrfs_test_opt(tree_root, NOSSD) &&
!fs_info->fs_devices->rotating) {
- printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
- "mode\n");
+ btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
btrfs_set_opt(fs_info->mount_opt, SSD);
}
/*
- * Mount does not set all options immediatelly, we can do it now and do
+ * Mount does not set all options immediately, we can do it now and do
* not have to wait for transaction commit
*/
btrfs_apply_pending_changes(fs_info);
@@ -3030,8 +3037,9 @@ retry_root_backup:
1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
- printk(KERN_WARNING "BTRFS: failed to initialize"
- " integrity check module %s\n", sb->s_id);
+ btrfs_warn(fs_info,
+ "failed to initialize integrity check module: %d",
+ ret);
}
#endif
ret = btrfs_read_qgroup_config(fs_info);
@@ -3061,8 +3069,8 @@ retry_root_backup:
ret = btrfs_recover_relocation(tree_root);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
- printk(KERN_WARNING
- "BTRFS: failed to recover relocation\n");
+ btrfs_warn(fs_info, "failed to recover relocation: %d",
+ ret);
err = -EINVAL;
goto fail_qgroup;
}
@@ -3083,11 +3091,11 @@ retry_root_backup:
if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- pr_info("BTRFS: creating free space tree\n");
+ btrfs_info(fs_info, "creating free space tree");
ret = btrfs_create_free_space_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to create free space tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
close_ctree(tree_root);
return ret;
}
@@ -3104,14 +3112,14 @@ retry_root_backup:
ret = btrfs_resume_balance_async(fs_info);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to resume balance\n");
+ btrfs_warn(fs_info, "failed to resume balance: %d", ret);
close_ctree(tree_root);
return ret;
}
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to resume dev_replace\n");
+ btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
close_ctree(tree_root);
return ret;
}
@@ -3120,33 +3128,33 @@ retry_root_backup:
if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- pr_info("BTRFS: clearing free space tree\n");
+ btrfs_info(fs_info, "clearing free space tree");
ret = btrfs_clear_free_space_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to clear free space tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to clear free space tree: %d", ret);
close_ctree(tree_root);
return ret;
}
}
if (!fs_info->uuid_root) {
- pr_info("BTRFS: creating UUID tree\n");
+ btrfs_info(fs_info, "creating UUID tree");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to create the UUID tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree: %d", ret);
close_ctree(tree_root);
return ret;
}
} else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
fs_info->generation !=
btrfs_super_uuid_tree_generation(disk_super)) {
- pr_info("BTRFS: checking UUID tree\n");
+ btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to check the UUID tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to check the UUID tree: %d", ret);
close_ctree(tree_root);
return ret;
}
@@ -3245,7 +3253,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
"lost page write due to IO error on %s",
rcu_str_deref(device->name));
- /* note, we dont' set_buffer_write_io_error because we have
+ /* note, we don't set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
clear_buffer_uptodate(bh);
@@ -3646,7 +3654,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
@@ -3686,7 +3694,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_std_error(root->fs_info, -EIO,
+ btrfs_handle_fs_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3704,7 +3712,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_std_error(root->fs_info, -EIO,
+ btrfs_handle_fs_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -4120,6 +4128,18 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
*/
+ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+ btrfs_err(fs_info, "bytes_used is too small %llu",
+ btrfs_super_bytes_used(sb));
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(btrfs_super_stripesize(sb)) ||
+ ((btrfs_super_stripesize(sb) != sectorsize) &&
+ (btrfs_super_stripesize(sb) != 4096))) {
+ btrfs_err(fs_info, "invalid stripesize %u",
+ btrfs_super_stripesize(sb));
+ ret = -EINVAL;
+ }
if (btrfs_super_num_devices(sb) > (1UL << 31))
printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
btrfs_super_num_devices(sb));
@@ -4357,7 +4377,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
if (ret)
break;
- clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ clear_extent_bits(dirty_pages, start, end, mark);
while (start <= end) {
eb = btrfs_find_tree_block(root->fs_info, start);
start += root->nodesize;
@@ -4392,7 +4412,7 @@ again:
if (ret)
break;
- clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ clear_extent_dirty(unpin, start, end);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8e79d0070bcf..acba821499a9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -90,7 +90,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
void btrfs_free_fs_root(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct btrfs_root *btrfs_alloc_dummy_root(void);
+struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize);
#endif
/*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 84e060eb0de8..29e5d000bbee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root,
{
u64 end = start + num_bytes - 1;
set_extent_bits(&root->fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
set_extent_bits(&root->fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
return 0;
}
@@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root,
end = start + cache->key.offset - 1;
clear_extent_bits(&root->fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
clear_extent_bits(&root->fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
}
static int exclude_super_stripes(struct btrfs_root *root,
@@ -980,7 +980,7 @@ out_free:
* event that tree block loses its owner tree's reference and do the
* back refs conversion.
*
- * When a tree block is COW'd through a tree, there are four cases:
+ * When a tree block is COWed through a tree, there are four cases:
*
* The reference count of the block is one and the tree is the block's
* owner tree. Nothing to do in this case.
@@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
struct btrfs_bio *bbio = NULL;
+ /*
+ * Avoid races with device replace and make sure our bbio has devices
+ * associated with its stripes that don't go away while we are discarding.
+ */
+ btrfs_bio_counter_inc_blocked(root->fs_info);
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
@@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
}
btrfs_put_bbio(bbio);
}
+ btrfs_bio_counter_dec(root->fs_info);
if (actual_bytes)
*actual_bytes = discarded_bytes;
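The bracket added above is the general pattern for any path that needs a bbio's stripe devices to stay valid against a concurrent device replace; a condensed sketch:

	btrfs_bio_counter_inc_blocked(fs_info);	/* blocks while a replace is finishing */
	ret = btrfs_map_block(fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0);
	if (!ret) {
		/* ... issue discards against bbio->stripes[i].dev ... */
		btrfs_put_bbio(bbio);
	}
	btrfs_bio_counter_dec(fs_info);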
@@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
/*
- * Need to drop our head ref lock and re-aqcuire the
+ * Need to drop our head ref lock and re-acquire the
* delayed ref lock and then re-check to make sure
* nobody got added.
*/
@@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
/*
* We don't ever fill up leaves all the way so multiply by 2 just to be
- * closer to what we're really going to want to ouse.
+ * closer to what we're really going to want to use.
*/
return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
}
@@ -2851,7 +2857,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
}
/*
- * trans->sync means that when we call end_transaciton, we won't
+ * trans->sync means that when we call end_transaction, we won't
* wait on delayed refs
*/
trans->sync = true;
@@ -3824,6 +3830,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
return readonly;
}
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *bg;
+ bool ret = true;
+
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg)
+ return false;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ atomic_inc(&bg->nocow_writers);
+ spin_unlock(&bg->lock);
+
+ /* no put on block group, done by btrfs_dec_nocow_writers */
+ if (!ret)
+ btrfs_put_block_group(bg);
+
+ return ret;
+
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->nocow_writers))
+ wake_up_atomic_t(&bg->nocow_writers);
+ /*
+ * Once for our lookup and once for the lookup done by a previous call
+ * to btrfs_inc_nocow_writers()
+ */
+ btrfs_put_block_group(bg);
+ btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+ wait_on_atomic_t(&bg->nocow_writers,
+ btrfs_wait_nocow_writers_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+}
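+
+A sketch of the caller pattern these helpers are built for (the nocow write path; the surrounding steps are abbreviated as comments, not taken verbatim from the patch):
+
+	if (btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
+		/* block group pinned: safe to write into the existing extent */
+		/* ... create the ordered extent for the nocow write ... */
+		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+	} else {
+		/* block group is (being made) read-only, fall back to COW */
+	}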
+
static const char *alloc_name(u64 flags)
{
switch (flags) {
@@ -4141,7 +4200,7 @@ commit_trans:
if (need_commit > 0) {
btrfs_start_delalloc_roots(fs_info, 0, -1);
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
}
trans = btrfs_join_transaction(root);
@@ -4243,7 +4302,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* Called if we need to clear a data reservation for this inode
* Normally in a error case.
*
- * This one will handle the per-indoe data rsv map for accurate reserved
+ * This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
@@ -4583,7 +4642,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
*/
btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_ordered_roots(root->fs_info, nr_items);
+ btrfs_wait_ordered_roots(root->fs_info, nr_items,
+ 0, (u64)-1);
}
}
@@ -4620,7 +4680,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(root, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,7 +4692,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
if (trans)
return;
if (wait_ordered)
- btrfs_wait_ordered_roots(root->fs_info, items);
+ btrfs_wait_ordered_roots(root->fs_info, items,
+ 0, (u64)-1);
return;
}
@@ -4671,7 +4732,8 @@ skip_async:
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(root->fs_info, items);
+ btrfs_wait_ordered_roots(root->fs_info, items,
+ 0, (u64)-1);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -4911,7 +4973,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
*
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
* flush out space to make room. It will do this by flushing delalloc if
* possible or committing the transaction. If flush is 0 then no attempts to
@@ -5516,7 +5578,7 @@ void btrfs_orphan_release_metadata(struct inode *inode)
* common file/directory operations, they change two fs/file trees
* and root tree, the number of items that the qgroup reserves is
* different with the free space reservation. So we can not use
- * the space reseravtion mechanism in start_transaction().
+ * the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
@@ -5565,7 +5627,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
/**
* drop_outstanding_extent - drop an outstanding extent
* @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're relaseing.
+ * @num_bytes: the number of bytes we're releasing.
*
* This is called when we are freeing up an outstanding extent, either called
* after an error or after an extent is written. This will return the number of
@@ -5591,7 +5653,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
drop_inode_space = 1;
/*
- * If we have more or the same amount of outsanding extents than we have
+ * If we have at least as many outstanding extents as we have
* reserved then we need to leave the reserved extents count alone.
*/
if (BTRFS_I(inode)->outstanding_extents >=
@@ -5605,8 +5667,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
}
/**
- * calc_csum_metadata_size - return the amount of metada space that must be
- * reserved/free'd for the given bytes.
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ * reserved/freed for the given bytes.
* @inode: the inode we're manipulating
* @num_bytes: the number of bytes in question
* @reserve: 1 if we are reserving space, 0 if we are freeing space
@@ -5758,7 +5820,7 @@ out_fail:
/*
* This is tricky, but first we need to figure out how much we
- * free'd from any free-ers that occurred during this
+ * freed from any free-ers that occurred during this
* reservation, so we reset ->csum_bytes to the csum_bytes
* before we dropped our lock, and then call the free for the
* number of bytes that were freed while we were trying our
@@ -5780,7 +5842,7 @@ out_fail:
/*
* Now reset ->csum_bytes to what it should be. If bytes is
- * more than to_free then we would have free'd more space had we
+ * more than to_free then we would have freed more space had we
* not had an artificially high ->csum_bytes, so we need to free
* the remainder. If bytes is the same or less then we don't
* need to do anything, the other free-ers did the correct
@@ -6172,6 +6234,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
return 0;
}
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+ atomic_inc(&bg->reservations);
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start)
+{
+ struct btrfs_block_group_cache *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->reservations))
+ wake_up_atomic_t(&bg->reservations);
+ btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+ struct btrfs_space_info *space_info = bg->space_info;
+
+ ASSERT(bg->ro);
+
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+ return;
+
+ /*
+ * Our block group is read only but before we set it to read only,
+ * some task might have allocated an extent from it already, but it
+ * has not yet created a respective ordered extent (and added it to a
+ * root's list of ordered extents).
+ * Therefore wait for any task currently allocating extents, since the
+ * block group's reservations counter is incremented while a read lock
+ * on the groups' semaphore is held and decremented after releasing
+ * the read access on that semaphore and creating the ordered extent.
+ */
+ down_write(&space_info->groups_sem);
+ up_write(&space_info->groups_sem);
+
+ wait_on_atomic_t(&bg->reservations,
+ btrfs_wait_bg_reservations_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+}
+
/**
* btrfs_update_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
@@ -6408,7 +6521,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ clear_extent_dirty(unpin, start, end);
unpin_extent_range(root, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -7025,36 +7138,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
int delalloc)
{
struct btrfs_block_group_cache *used_bg = NULL;
- bool locked = false;
-again:
+
spin_lock(&cluster->refill_lock);
- if (locked) {
- if (used_bg == cluster->block_group)
+ while (1) {
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
return used_bg;
- up_read(&used_bg->data_rwsem);
- btrfs_put_block_group(used_bg);
- }
+ btrfs_get_block_group(used_bg);
- used_bg = cluster->block_group;
- if (!used_bg)
- return NULL;
+ if (!delalloc)
+ return used_bg;
- if (used_bg == block_group)
- return used_bg;
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
- btrfs_get_block_group(used_bg);
+ spin_unlock(&cluster->refill_lock);
- if (!delalloc)
- return used_bg;
+ down_read(&used_bg->data_rwsem);
- if (down_read_trylock(&used_bg->data_rwsem))
- return used_bg;
+ spin_lock(&cluster->refill_lock);
+ if (used_bg == cluster->block_group)
+ return used_bg;
- spin_unlock(&cluster->refill_lock);
- down_read(&used_bg->data_rwsem);
- locked = true;
- goto again;
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
}
static inline void
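
The rewritten btrfs_lock_cluster() above replaces the goto-based retry with a loop but keeps the usual ordering rule: a sleeping lock may only be trylock'ed while a spinlock is held; if the trylock fails, drop the spinlock, take the sleeping lock, re-take the spinlock and revalidate that the protected pointer still points at the same object. Below is a rough userspace analogue of that dance (POSIX threads only; lock_current_obj(), refill_lock, current_obj and the mutex standing in for the data_rwsem are inventions of the sketch, and the reference counting done by the real code is omitted).

#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;		/* a sleeping lock, like data_rwsem */
	int id;
};

static pthread_spinlock_t refill_lock;	/* spinlock guarding the pointer */
static struct obj *current_obj;		/* pointer protected by refill_lock */

/* returns with refill_lock held, plus obj->lock held on success */
static struct obj *lock_current_obj(void)
{
	pthread_spin_lock(&refill_lock);
	while (1) {
		struct obj *o = current_obj;

		if (!o)
			return NULL;
		if (pthread_mutex_trylock(&o->lock) == 0)
			return o;		/* fast path: both locks held */

		/* cannot sleep under the spinlock: drop, sleep, revalidate */
		pthread_spin_unlock(&refill_lock);
		pthread_mutex_lock(&o->lock);
		pthread_spin_lock(&refill_lock);
		if (o == current_obj)
			return o;		/* still the right object */
		pthread_mutex_unlock(&o->lock);	/* it changed, try again */
	}
}

int main(void)
{
	struct obj o = { .lock = PTHREAD_MUTEX_INITIALIZER, .id = 1 };
	struct obj *locked;

	pthread_spin_init(&refill_lock, PTHREAD_PROCESS_PRIVATE);
	current_obj = &o;
	locked = lock_current_obj();
	printf("locked object %d\n", locked ? locked->id : -1);
	if (locked)
		pthread_mutex_unlock(&locked->lock);
	pthread_spin_unlock(&refill_lock);
	return 0;
}
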
@@ -7431,6 +7543,7 @@ checks:
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
}
+ btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
ins->objectid = search_start;
@@ -7471,7 +7584,7 @@ loop:
if (loop == LOOP_CACHING_NOWAIT) {
/*
* We want to skip the LOOP_CACHING_WAIT step if we
- * don't have any unached bgs and we've alrelady done a
+ * don't have any uncached bgs and we've already done a
* full search through.
*/
if (orig_have_caching_bg || !full_search)
@@ -7612,8 +7725,10 @@ again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
flags, delalloc);
-
- if (ret == -ENOSPC) {
+ if (!ret && !is_data) {
+ btrfs_dec_block_group_reservations(root->fs_info,
+ ins->objectid);
+ } else if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
num_bytes = min(num_bytes >> 1, ins->offset);
num_bytes = round_down(num_bytes, root->sectorsize);
@@ -7873,7 +7988,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
/*
* Mixed block groups will exclude before processing the log so we only
- * need to do the exlude dance if this fs isn't mixed.
+ * need to do the exclude dance if this fs isn't mixed.
*/
if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
@@ -7901,8 +8016,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(buf))
+ return buf;
+
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
@@ -7923,13 +8039,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
+ buf->start + buf->len - 1);
} else {
buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
- trans->blocks_used++;
+ trans->dirty = true;
/* this returns a buffer locked for blocking */
return buf;
}
@@ -8544,8 +8660,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = btrfs_find_tree_block(root->fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr);
- if (!next)
- return -ENOMEM;
+ if (IS_ERR(next))
+ return PTR_ERR(next);
+
btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
level - 1);
reada = 1;
@@ -9058,7 +9175,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err, NULL);
+ btrfs_handle_fs_error(root->fs_info, err, NULL);
return err;
}
@@ -9317,7 +9434,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
u64 free_bytes = 0;
int factor;
- /* It's df, we don't care if it's racey */
+ /* It's df, we don't care if it's racy */
if (list_empty(&sinfo->ro_bgs))
return 0;
@@ -10526,14 +10643,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(root, block_group);
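
The block group reservations counter added earlier in this file pairs an atomic count, taken while the read side of space_info->groups_sem is held, with a write-lock/unlock barrier plus wait_on_atomic_t() on the waiter side. The sketch below is a stand-alone userspace analogue of that scheme, assuming only POSIX threads; struct reservations, allocator() and wait_for_reservations() are invented names, and a mutex/condvar pair stands in for the atomic_t wait queue.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct reservations {
	pthread_rwlock_t groups_sem;	/* stands in for space_info->groups_sem */
	pthread_mutex_t lock;
	pthread_cond_t drained;
	int count;			/* stands in for bg->reservations */
};

static void reservation_inc(struct reservations *r)
{
	pthread_mutex_lock(&r->lock);
	r->count++;
	pthread_mutex_unlock(&r->lock);
}

static void reservation_dec(struct reservations *r)
{
	pthread_mutex_lock(&r->lock);
	if (--r->count == 0)
		pthread_cond_broadcast(&r->drained);	/* like wake_up_atomic_t() */
	pthread_mutex_unlock(&r->lock);
}

static void *allocator(void *arg)
{
	struct reservations *r = arg;

	pthread_rwlock_rdlock(&r->groups_sem);	/* allocation path */
	reservation_inc(r);
	pthread_rwlock_unlock(&r->groups_sem);
	usleep(1000);				/* "create the ordered extent" */
	reservation_dec(r);
	return NULL;
}

static void wait_for_reservations(struct reservations *r)
{
	/* barrier: anyone who got the read lock before us has bumped count */
	pthread_rwlock_wrlock(&r->groups_sem);
	pthread_rwlock_unlock(&r->groups_sem);

	pthread_mutex_lock(&r->lock);
	while (r->count)
		pthread_cond_wait(&r->drained, &r->lock);
	pthread_mutex_unlock(&r->lock);
}

int main(void)
{
	struct reservations r = {
		.groups_sem = PTHREAD_RWLOCK_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.drained = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, allocator, &r);
	wait_for_reservations(&r);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("all reservations drained\n");
	return 0;
}
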
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d247fc0eea19..aaee3ef55ed8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -726,14 +726,6 @@ next:
start = last_end + 1;
if (start <= end && state && !need_resched())
goto hit_next;
- goto search_again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
search_again:
if (start > end)
@@ -742,6 +734,14 @@ search_again:
if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
}
static void wait_on_state(struct extent_io_tree *tree,
@@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
prealloc = alloc_extent_state(mask);
- BUG_ON(!prealloc);
}
spin_lock(&tree->lock);
@@ -1037,7 +1043,13 @@ hit_next:
goto out;
}
- goto search_again;
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
out:
spin_unlock(&tree->lock);
@@ -1046,13 +1058,6 @@ out:
return err;
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
}
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
* @cached_state: state that we're going to cache
- * @mask: the allocation mask
*
* This will go through and set bits for the given range. If any states exist
* already in this range they are set with the given bit and cleared of the
* clear_bits. This is only meant to be used by things that are mergeable, ie
* converting from say DELALLOC to DIRTY. This is not meant to be used with
* boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state, gfp_t mask)
+ struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
+ if (!prealloc) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1106,7 +1112,7 @@ again:
* extent state allocations are needed. We'll only know this
* after locking the tree.
*/
- prealloc = alloc_extent_state(mask);
+ prealloc = alloc_extent_state(GFP_NOFS);
if (!prealloc && !first_iteration)
return -ENOMEM;
}
@@ -1263,7 +1269,13 @@ hit_next:
goto out;
}
- goto search_again;
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
out:
spin_unlock(&tree->lock);
@@ -1271,21 +1283,11 @@ out:
free_extent_state(prealloc);
return err;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- first_iteration = false;
- goto again;
}
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset)
+ unsigned bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
changeset);
}
@@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset)
+ unsigned bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
@@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+ return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
changeset);
}
@@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
rec->start + rec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY);
if (ret)
err = ret;
ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
rec->start + rec->len - 1,
- EXTENT_DAMAGED, GFP_NOFS);
+ EXTENT_DAMAGED);
if (ret && !err)
err = ret;
@@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
bio->bi_iter.bi_size = 0;
map_length = length;
+ /*
+ * Avoid races with device replace and make sure our bbio has devices
+ * associated to its stripes that don't go away while we are doing the
+ * read repair operation.
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
@@ -2036,6 +2044,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
dev = bbio->stripes[mirror_num-1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
@@ -2044,6 +2053,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
/* try to remap that extent elsewhere? */
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
@@ -2053,6 +2063,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
btrfs_ino(inode), start,
rcu_str_deref(dev->name), sector);
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return 0;
}
@@ -2232,13 +2243,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
/* set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0)
ret = set_state_failrec(failure_tree, start, failrec);
/* set the bits in the inode's tree */
if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
- GFP_NOFS);
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
if (ret < 0) {
kfree(failrec);
return ret;
@@ -3200,14 +3210,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret;
}
-static noinline void update_nr_written(struct page *page,
- struct writeback_control *wbc,
- unsigned long nr_written)
+static void update_nr_written(struct page *page, struct writeback_control *wbc,
+ unsigned long nr_written)
{
wbc->nr_to_write -= nr_written;
- if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
- wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
- page->mapping->writeback_index = page->index + nr_written;
}
/*
@@ -3368,6 +3374,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
while (cur <= end) {
u64 em_end;
+ unsigned long max_nr;
+
if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
@@ -3423,32 +3431,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
continue;
}
- if (tree->ops && tree->ops->writepage_io_hook) {
- ret = tree->ops->writepage_io_hook(page, cur,
- cur + iosize - 1);
- } else {
- ret = 0;
+ max_nr = (i_size >> PAGE_SHIFT) + 1;
+
+ set_range_writeback(tree, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "page %lu not writeback, cur %llu end %llu",
+ page->index, cur, end);
}
- if (ret) {
- SetPageError(page);
- } else {
- unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1;
- set_range_writeback(tree, cur, cur + iosize - 1);
- if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
- "page %lu not writeback, cur %llu end %llu",
- page->index, cur, end);
- }
+ ret = submit_extent_page(write_flags, tree, wbc, page,
+ sector, iosize, pg_offset,
+ bdev, &epd->bio, max_nr,
+ end_bio_extent_writepage,
+ 0, 0, 0, false);
+ if (ret)
+ SetPageError(page);
- ret = submit_extent_page(write_flags, tree, wbc, page,
- sector, iosize, pg_offset,
- bdev, &epd->bio, max_nr,
- end_bio_extent_writepage,
- 0, 0, 0, false);
- if (ret)
- SetPageError(page);
- }
cur = cur + iosize;
pg_offset += iosize;
nr++;
@@ -3920,12 +3919,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
- int err = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int range_whole = 0;
int scanned = 0;
int tag;
@@ -3948,6 +3948,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3957,6 +3959,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -3966,6 +3969,7 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ done_index = page->index;
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
@@ -4007,8 +4011,20 @@ retry:
unlock_page(page);
ret = 0;
}
- if (!err && ret < 0)
- err = ret;
+ if (ret < 0) {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4020,7 +4036,7 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done && !err) {
+ if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
@@ -4029,8 +4045,12 @@ retry:
index = 0;
goto retry;
}
+
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
+ mapping->writeback_index = done_index;
+
btrfs_add_delayed_iput(inode);
- return err;
+ return ret;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
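
The extent_write_cache_pages() changes above move the writeback_index update out of update_nr_written(), track a done_index, and stop the loop on the first hard page error so background writeout is not wedged on a failing page forever. A reduced stand-alone sketch of that bookkeeping, with an invented write_one_page() and a simulated failure:

#include <stdio.h>

#define NR_PAGES 16

static int write_one_page(unsigned long index)
{
	return index == 9 ? -5 /* -EIO */ : 0;	/* pretend page 9 fails */
}

int main(void)
{
	unsigned long index = 0, done_index = 0, writeback_index;
	int ret = 0, done = 0;

	while (!done && index < NR_PAGES) {
		done_index = index;
		ret = write_one_page(index);
		if (ret < 0) {
			/* step past the failing page so a later pass does
			 * not immediately trip over it again */
			done_index = index + 1;
			done = 1;
			break;
		}
		index++;
	}

	writeback_index = done_index;	/* like mapping->writeback_index */
	printf("stopped at page %lu, ret %d\n", writeback_index, ret);
	return 0;
}
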
@@ -4379,8 +4399,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret < 0) {
btrfs_free_path(path);
return ret;
+ } else {
+ WARN_ON(!ret);
+ if (ret == 1)
+ ret = 0;
}
- WARN_ON(!ret);
+
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
found_type = found_key.type;
@@ -4591,7 +4615,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
if (mapped)
spin_unlock(&page->mapping->private_lock);
- /* One for when we alloced the page */
+ /* One for when we allocated the page */
put_page(page);
} while (index != 0);
}
@@ -4704,16 +4728,16 @@ err:
}
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u32 nodesize)
{
unsigned long len;
if (!fs_info) {
/*
* Called only from tests that don't always have a fs_info
- * available, but we know that nodesize is 4096
+ * available
*/
- len = 4096;
+ len = nodesize;
} else {
len = fs_info->tree_root->nodesize;
}
@@ -4809,7 +4833,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u32 nodesize)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4817,12 +4841,12 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
- eb = alloc_dummy_extent_buffer(fs_info, start);
+ eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
if (!eb)
return NULL;
eb->fs_info = fs_info;
again:
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = radix_tree_preload(GFP_NOFS);
if (ret)
goto free_eb;
spin_lock(&fs_info->buffer_lock);
@@ -4868,18 +4892,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int uptodate = 1;
int ret;
+ if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return ERR_PTR(-EINVAL);
+ }
+
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p)
+ if (!p) {
+ exists = ERR_PTR(-ENOMEM);
goto free_eb;
+ }
spin_lock(&mapping->private_lock);
if (PagePrivate(p)) {
@@ -4923,9 +4954,11 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (ret)
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -5751,7 +5784,7 @@ int try_release_extent_buffer(struct page *page)
struct extent_buffer *eb;
/*
- * We need to make sure noboody is attaching this page to an eb right
+ * We need to make sure nobody is attaching this page to an eb right
* now.
*/
spin_lock(&page->mapping->private_lock);
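
Several hunks in this file drop the BUG_ON on the preallocated extent state and tolerate allocation failure: the spare object is allocated before taking the tree lock, failure is ignored, and the allocation only becomes mandatory at the point a split actually needs it. A minimal stand-alone sketch of that pattern follows; alloc_state(), set_bits() and the error values are placeholders, and the real code retries (or returns -ENOMEM on a later iteration) rather than failing immediately.

#include <stdlib.h>
#include <stdio.h>

struct state { int dummy; };

static struct state *alloc_state(void) { return malloc(sizeof(struct state)); }

static int set_bits(int need_split)
{
	struct state *prealloc = alloc_state();	/* may be NULL, that is fine */
	int ret = 0;

	/* ... lock the tree, walk the range ... */
	if (need_split) {
		if (!prealloc)
			prealloc = alloc_state();	/* now it is mandatory */
		if (!prealloc) {
			ret = -12;			/* -ENOMEM */
			goto out;
		}
		/* ... insert prealloc into the tree, it is consumed ... */
		prealloc = NULL;
	}
out:
	/* ... unlock ... */
	free(prealloc);				/* unused spare, if any */
	return ret;
}

int main(void)
{
	printf("%d %d\n", set_bits(0), set_bits(1));
	return 0;
}
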
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b5e0ade90e88..c0c1c4fef6ce 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -71,7 +71,6 @@ struct extent_io_ops {
u64 start, u64 end, int *page_started,
unsigned long *nr_written);
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
- int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
@@ -221,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset);
+ unsigned bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
@@ -241,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits, gfp_t mask)
+ u64 end, unsigned bits)
{
int wake = 0;
if (bits & EXTENT_LOCKED)
wake = 1;
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL,
+ GFP_NOFS);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset);
+ unsigned bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits, gfp_t mask)
+ u64 end, unsigned bits)
{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -279,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
+ u64 end)
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state, gfp_t mask);
+ struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
+ u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE,
- NULL, cached_state, mask);
+ NULL, cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
+ u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, mask);
+ NULL, cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
+ u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
+ GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -349,7 +348,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u32 nodesize);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -469,5 +468,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 *end, u64 max_bytes);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u32 nodesize);
#endif
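
The header changes drop the gfp_t parameter from the set/clear helpers and pin the inline wrappers to GFP_NOFS, leaving only set_extent_bit()/clear_extent_bit() with an explicit mask. A generic sketch of that API shape, with invented names and for illustration only:

#include <stdio.h>

typedef unsigned gfp_t;
#define GFP_NOFS	0x10u

static int set_bit_full(unsigned long start, unsigned long end,
			unsigned bits, gfp_t mask)
{
	printf("set [%lu,%lu] bits %#x mask %#x\n", start, end, bits, mask);
	return 0;
}

/* the common case no longer exposes the allocation mask at all */
static inline int set_bits(unsigned long start, unsigned long end,
			   unsigned bits)
{
	return set_bit_full(start, end, bits, GFP_NOFS);
}

int main(void)
{
	return set_bits(0, 4095, 0x1);
}
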
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 318b048eb254..e0715fcfb11e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
/**
* free_extent_map - drop reference count of an extent_map
- * @em: extent map being releasead
+ * @em: extent map being released
*
* Drops the reference out on @em by one and free the structure
* if the reference count hits zero.
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7a7d6e253cfc..62a81ee13a5f 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -248,7 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
offset + root->sectorsize - 1,
- EXTENT_NODATASUM, GFP_NOFS);
+ EXTENT_NODATASUM);
} else {
btrfs_info(BTRFS_I(inode)->root->fs_info,
"no csum found for inode %llu start %llu",
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ea9f10bb089c..e0c9bd3fb02d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1596,6 +1596,13 @@ again:
copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+ num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ reserve_bytes);
+ dirty_sectors = round_up(copied + sector_offset,
+ root->sectorsize);
+ dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ dirty_sectors);
+
/*
* if we have trouble faulting in the pages, fall
* back to one page at a time
@@ -1605,6 +1612,7 @@ again:
if (copied == 0) {
force_page_uptodate = true;
+ dirty_sectors = 0;
dirty_pages = 0;
} else {
force_page_uptodate = false;
@@ -1615,20 +1623,19 @@ again:
/*
* If we had a short copy we need to release the excess delaloc
* bytes we reserved. We need to increment outstanding_extents
- * because btrfs_delalloc_release_space will decrement it, but
+ * because btrfs_delalloc_release_space and
+ * btrfs_delalloc_release_metadata will decrement it, but
* we still have an outstanding extent for the chunk we actually
* managed to copy.
*/
- num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- reserve_bytes);
- dirty_sectors = round_up(copied + sector_offset,
- root->sectorsize);
- dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- dirty_sectors);
-
if (num_sectors > dirty_sectors) {
- release_bytes = (write_bytes - copied)
- & ~((u64)root->sectorsize - 1);
+ /*
+ * we round down because we don't want to count
+ * any partial blocks actually sent through the
+ * IO machines
+ */
+ release_bytes = round_down(release_bytes - copied,
+ root->sectorsize);
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
@@ -1696,7 +1703,9 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, pos, release_bytes);
+ btrfs_delalloc_release_space(inode,
+ round_down(pos, root->sectorsize),
+ release_bytes);
}
}
@@ -2020,7 +2029,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
BTRFS_I(inode)->last_trans
<= root->fs_info->last_trans_committed)) {
/*
- * We'v had everything committed since the last time we were
+ * We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
* reason, it's no longer relevant.
*/
@@ -2368,7 +2377,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* Check the aligned pages after the first unaligned page,
* if offset != orig_start, which means the first unaligned page
- * including serveral following pages are already in holes,
+ * including several following pages are already in holes,
* the extra check can be skipped */
if (offset == orig_start) {
/* after truncate page, check hole again */
@@ -2952,7 +2961,7 @@ const struct file_operations btrfs_file_operations = {
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_ioctl,
+ .compat_ioctl = btrfs_compat_ioctl,
#endif
.copy_file_range = btrfs_copy_file_range,
.clone_file_range = btrfs_clone_file_range,
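
The file.c hunks compute num_sectors/dirty_sectors before the short-copy handling and round the released reservation down to the sector size, so partially written blocks that still go through the IO path keep their reservation. A worked example with made-up numbers (SECTORSIZE and the byte counts below are illustrative, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE	4096ULL

static uint64_t round_up_(uint64_t x, uint64_t a)   { return (x + a - 1) / a * a; }
static uint64_t round_down_(uint64_t x, uint64_t a) { return x / a * a; }

int main(void)
{
	uint64_t sector_offset = 512;		/* write starts mid-block */
	uint64_t write_bytes = 20000;		/* what we reserved for */
	uint64_t copied = 9000;			/* short copy from userspace */
	uint64_t reserve_bytes = round_up_(write_bytes + sector_offset, SECTORSIZE);
	uint64_t num_sectors = reserve_bytes / SECTORSIZE;
	uint64_t dirty_sectors = round_up_(copied + sector_offset, SECTORSIZE) / SECTORSIZE;
	uint64_t release_bytes = reserve_bytes;

	if (num_sectors > dirty_sectors) {
		/* round down: partial blocks still reach the IO path */
		release_bytes = round_down_(release_bytes - copied, SECTORSIZE);
		printf("release %llu bytes of the %llu reserved\n",
		       (unsigned long long)release_bytes,
		       (unsigned long long)reserve_bytes);
	}
	return 0;
}
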
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5e6062c26129..69d270f6602c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -29,7 +29,7 @@
#include "inode-map.h"
#include "volumes.h"
-#define BITS_PER_BITMAP (PAGE_SIZE * 8)
+#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_32K
struct btrfs_trim_range {
@@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
- u32 bytes_per_bitmap;
+ u64 bytes_per_bitmap;
bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
bitmap_start = offset - ctl->start;
- bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
bitmap_start += ctl->start;
@@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
- u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
- max_bitmaps = max_t(u32, max_bitmaps, 1);
+ max_bitmaps = max_t(u64, max_bitmaps, 1);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
@@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
* sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
* we add more bitmaps.
*/
- bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE;
+ bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
if (bitmap_bytes >= max_bytes) {
ctl->extents_thresh = 0;
@@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
- * to reserve them to larger extents, however if we have plent
+ * to reserve them to larger extents, however if we have plenty
* of cache left then go ahead and add them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
@@ -3662,7 +3662,7 @@ have_info:
if (tmp->offset + tmp->bytes < offset)
break;
if (offset + bytes < tmp->offset) {
- n = rb_prev(&info->offset_index);
+ n = rb_prev(&tmp->offset_index);
continue;
}
info = tmp;
@@ -3676,7 +3676,7 @@ have_info:
if (offset + bytes < tmp->offset)
break;
if (tmp->offset + tmp->bytes < offset) {
- n = rb_next(&info->offset_index);
+ n = rb_next(&tmp->offset_index);
continue;
}
info = tmp;
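
The free-space-cache.c changes widen BITS_PER_BITMAP and the bytes-per-bitmap math to 64 bits, presumably to avoid 32-bit overflow on configurations with large pages and units: with a 64K page size and a 64K ctl->unit the product PAGE_SIZE * 8 * unit is 2^35, which wraps in 32-bit arithmetic. A stand-alone demonstration with those illustrative values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t page_size = 65536;		/* e.g. 64K pages on ppc64 */
	uint64_t unit = 65536;			/* ctl->unit: bytes per bitmap bit */

	uint32_t narrow = (uint32_t)(page_size * 8) * (uint32_t)unit; /* truncates */
	uint64_t wide = page_size * 8UL * unit;

	printf("u32 bytes_per_bitmap: %u\n", narrow);	/* wraps to 0 */
	printf("u64 bytes_per_bitmap: %llu\n", (unsigned long long)wide);
	return 0;
}
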
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 33178c490ace..3af651c2bbc7 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space(
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
-/* Support functions for runnint our sanity tests */
+/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
u64 offset, u64 bytes, bool bitmap);
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index aae520b2aee5..a97fdc156a03 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -24,6 +24,11 @@ int __init btrfs_hash_init(void)
return PTR_ERR_OR_ZERO(tfm);
}
+const char* btrfs_crc32c_impl(void)
+{
+ return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
+}
+
void btrfs_hash_exit(void)
{
crypto_free_shash(tfm);
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 118a2316e5d3..c3a2ec554361 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -22,6 +22,7 @@
int __init btrfs_hash_init(void);
void btrfs_hash_exit(void);
+const char* btrfs_crc32c_impl(void);
u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index be4d22a5022f..b8acc07ac6c2 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
name, name_len, &extref)) {
- btrfs_std_error(root->fs_info, -ENOENT, NULL);
+ btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b7fe291a174..d2be95cfb6d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -455,7 +455,7 @@ again:
/*
* skip compression for a small file range(<=blocksize) that
- * isn't an inline extent, since it dosen't save disk space at all.
+ * isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
@@ -824,6 +824,7 @@ retry:
async_extent->ram_size - 1, 0);
goto out_free_reserve;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
/*
* clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
}
return;
out_free_reserve:
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
goto out_drop_extent_cache;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
if (disk_num_bytes < cur_alloc_size)
break;
@@ -1066,6 +1070,7 @@ out:
out_drop_extent_cache:
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1377,6 +1382,9 @@ next_slot:
*/
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out_check;
+ if (!btrfs_inc_nocow_writers(root->fs_info,
+ disk_bytenr))
+ goto out_check;
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
@@ -1391,6 +1399,9 @@ out_check:
path->slots[0]++;
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info,
+ disk_bytenr);
goto next_slot;
}
if (!nocow) {
@@ -1411,6 +1422,9 @@ out_check:
if (ret) {
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info,
+ disk_bytenr);
goto error;
}
cow_start = (u64)-1;
@@ -1453,6 +1467,8 @@ out_check:
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
@@ -1962,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
{
WARN_ON((end & (PAGE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- cached_state, GFP_NOFS);
+ cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -3103,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
- GFP_NOFS);
+ clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
return 0;
}
@@ -3256,7 +3271,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
- BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
+ ASSERT(!ret);
+ if (ret) {
+ atomic_dec(&root->orphan_inodes);
+ clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
+ if (insert)
+ clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
+ return ret;
+ }
}
/* insert an orphan item to track this unlinked/truncated file */
@@ -3706,7 +3730,7 @@ cache_index:
* and doesn't have an inode ref with the name "bar" anymore.
*
* Setting last_unlink_trans to last_trans is a pessimistic approach,
- * but it guarantees correctness at the expense of ocassional full
+ * but it guarantees correctness at the expense of occasional full
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
@@ -4962,7 +4986,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* be instantly completed which will give us extents that need
* to be truncated. If we fail to get an orphan inode down we
* could have left over extents that were never meant to live,
- * so we need to garuntee from this point on that everything
+ * so we need to guarantee from this point on that everything
* will be consistent.
*/
ret = btrfs_orphan_add(trans, inode);
@@ -5232,7 +5256,7 @@ void btrfs_evict_inode(struct inode *inode)
}
/*
- * We can't just steal from the global reserve, we need tomake
+ * We can't just steal from the global reserve, we need to make
* sure there is room to do it, if not we need to commit and try
* again.
*/
@@ -6964,7 +6988,18 @@ insert:
* existing will always be non-NULL, since there must be
* extent causing the -EEXIST.
*/
- if (start >= extent_map_end(existing) ||
+ if (existing->start == em->start &&
+ extent_map_end(existing) == extent_map_end(em) &&
+ em->block_start == existing->block_start) {
+ /*
+ * these two extents are the same, it happens
+ * with inlines especially
+ */
+ free_extent_map(em);
+ em = existing;
+ err = 0;
+
+ } else if (start >= extent_map_end(existing) ||
start <= existing->start) {
/*
* The existing extent map is the one nearest to
@@ -7129,6 +7164,43 @@ out:
return em;
}
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+ const u64 start,
+ const u64 len,
+ const u64 orig_start,
+ const u64 block_start,
+ const u64 block_len,
+ const u64 orig_block_len,
+ const u64 ram_bytes,
+ const int type)
+{
+ struct extent_map *em = NULL;
+ int ret;
+
+ down_read(&BTRFS_I(inode)->dio_sem);
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = create_pinned_em(inode, start, len, orig_start,
+ block_start, block_len, orig_block_len,
+ ram_bytes, type);
+ if (IS_ERR(em))
+ goto out;
+ }
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+ len, block_len, type);
+ if (ret) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_cache(inode, start,
+ start + len - 1, 0);
+ }
+ em = ERR_PTR(ret);
+ }
+ out:
+ up_read(&BTRFS_I(inode)->dio_sem);
+
+ return em;
+}
+
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
u64 start, u64 len)
{
@@ -7144,41 +7216,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
if (ret)
return ERR_PTR(ret);
- /*
- * Create the ordered extent before the extent map. This is to avoid
- * races with the fast fsync path that would lead to it logging file
- * extent items that point to disk extents that were not yet written to.
- * The fast fsync path collects ordered extents into a local list and
- * then collects all the new extent maps, so we must create the ordered
- * extent first and make sure the fast fsync path collects any new
- * ordered extents after collecting new extent maps as well.
- * The fsync path simply can not rely on inode_dio_wait() because it
- * causes deadlock with AIO.
- */
- ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
- ins.offset, ins.offset, 0);
- if (ret) {
+ em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+ ins.objectid, ins.offset, ins.offset,
+ ins.offset, 0);
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ if (IS_ERR(em))
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return ERR_PTR(ret);
- }
-
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- struct btrfs_ordered_extent *oe;
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- oe = btrfs_lookup_ordered_extent(inode, start);
- ASSERT(oe);
- if (WARN_ON(!oe))
- return em;
- set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
- set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
- btrfs_remove_ordered_extent(inode, oe);
- /* Once for our lookup and once for the ordered extents tree. */
- btrfs_put_ordered_extent(oe);
- btrfs_put_ordered_extent(oe);
- }
return em;
}
@@ -7408,7 +7452,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
cached_state);
/*
* We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure theres no ordered
+ * doing DIO to, so we need to make sure there's no ordered
* extents in this range.
*/
ordered = btrfs_lookup_ordered_range(inode, lockstart,
@@ -7570,7 +7614,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (current->journal_info) {
/*
* Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transction doesn't get
+ * that anything that needs to check if there's a transaction doesn't get
* confused.
*/
dio_data = current->journal_info;
@@ -7603,7 +7647,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* decompress it, so there will be buffering required no matter what we
* do, so go ahead and fallback to buffered.
*
- * We return -ENOTBLK because thats what makes DIO go ahead and go back
+ * We return -ENOTBLK because that's what makes DIO go ahead and go back
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
@@ -7650,24 +7694,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1) {
+ &orig_block_len, &ram_bytes) == 1 &&
+ btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+ struct extent_map *em2;
+
+ em2 = btrfs_create_dio_extent(inode, start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(root->fs_info, block_start);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
- em = create_pinned_em(inode, start, len,
- orig_start,
- block_start, len,
- orig_block_len,
- ram_bytes, type);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
+ em = em2;
}
-
- ret = btrfs_add_ordered_extent_dio(inode, start,
- block_start, len, len, type);
- if (ret) {
- free_extent_map(em);
+ if (em2 && IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
goto unlock_err;
}
goto unlock;
@@ -9019,7 +9060,7 @@ static int btrfs_truncate(struct inode *inode)
return ret;
/*
- * Yes ladies and gentelment, this is indeed ugly. The fact is we have
+ * Yes ladies and gentlemen, this is indeed ugly. The fact is we have
* 3 things going on here
*
* 1) We need to reserve space for our orphan item and the space to
@@ -9033,15 +9074,15 @@ static int btrfs_truncate(struct inode *inode)
* space reserved in case it uses space during the truncate (thank you
* very much snapshotting).
*
- * And we need these to all be seperate. The fact is we can use alot of
+ * And we need these to all be separate. The fact is we can use a lot of
* space doing the truncate, and we have no earthly idea how much space
- * we will use, so we need the truncate reservation to be seperate so it
+ * we will use, so we need the truncate reservation to be separate so it
* doesn't end up using space reserved for updating the inode or
* removing the orphan item. We also need to be able to stop the
* transaction and start a new one, which means we need to be able to
* update the inode several times, and we have no idea of knowing how
* many times that will be, so we can't just reserve 1 item for the
- * entirety of the opration, so that has to be done seperately as well.
+ * entirety of the operation, so that has to be done separately as well.
* Then there is the orphan item, which does indeed need to be held on
* to for the whole operation, and we need nobody to touch this reserved
* space except the orphan code.
@@ -9230,6 +9271,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
+ init_rwsem(&ei->dio_sem);
return inode;
}
@@ -9387,10 +9429,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
return 0;
}
+static int btrfs_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct timespec ctime = CURRENT_TIME;
+ struct dentry *parent;
+ u64 old_ino = btrfs_ino(old_inode);
+ u64 new_ino = btrfs_ino(new_inode);
+ u64 old_idx = 0;
+ u64 new_idx = 0;
+ u64 root_objectid;
+ int ret;
+ bool root_log_pinned = false;
+ bool dest_log_pinned = false;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ return -EXDEV;
+
+ /* close the race window with snapshot create/destroy ioctl */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&dest->fs_info->subvol_sem);
+
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ trans = btrfs_start_transaction(root, 12);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
+
+ /*
+ * We need to find a free sequence number both in the source and
+ * in the destination directory for the exchange.
+ */
+ ret = btrfs_set_inode_index(new_dir, &old_idx);
+ if (ret)
+ goto out_fail;
+ ret = btrfs_set_inode_index(old_dir, &new_idx);
+ if (ret)
+ goto out_fail;
+
+ BTRFS_I(old_inode)->dir_index = 0ULL;
+ BTRFS_I(new_inode)->dir_index = 0ULL;
+
+ /* Reference for the source. */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ } else {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_ino,
+ btrfs_ino(new_dir), old_idx);
+ if (ret)
+ goto out_fail;
+ }
+
+ /* And now for the dest. */
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(dest->fs_info, trans);
+ } else {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ ret = btrfs_insert_inode_ref(trans, root,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len,
+ new_ino,
+ btrfs_ino(old_dir), new_idx);
+ if (ret)
+ goto out_fail;
+ }
+
+ /* Update inode version and ctime/mtime. */
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
+ inode_inc_iversion(new_inode);
+ old_dir->i_ctime = old_dir->i_mtime = ctime;
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+ new_inode->i_ctime = ctime;
+
+ if (old_dentry->d_parent != new_dentry->d_parent) {
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+ btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+ }
+
+ /* src is a subvolume */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir,
+ root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else { /* src is an inode */
+ ret = __btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, old_inode);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ /* dest is a subvolume */
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ } else { /* dest is an inode */
+ ret = __btrfs_unlink_inode(trans, dest, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, dest, new_inode);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, old_idx);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ ret = btrfs_add_link(trans, old_dir, new_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, 0, new_idx);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = old_idx;
+ if (new_inode->i_nlink == 1)
+ BTRFS_I(new_inode)->dir_index = new_idx;
+
+ if (root_log_pinned) {
+ parent = new_dentry->d_parent;
+ btrfs_log_new_name(trans, old_inode, old_dir, parent);
+ btrfs_end_log_trans(root);
+ root_log_pinned = false;
+ }
+ if (dest_log_pinned) {
+ parent = old_dentry->d_parent;
+ btrfs_log_new_name(trans, new_inode, new_dir, parent);
+ btrfs_end_log_trans(dest);
+ dest_log_pinned = false;
+ }
+out_fail:
+ /*
+ * If we have pinned a log and an error happened, we unpin tasks
+ * trying to sync the log and force them to fallback to a transaction
+ * commit if the log currently contains any of the inodes involved in
+ * this rename operation (to ensure we do not persist a log with an
+ * inconsistent state for any of these inodes or lead to any
+ * inconsistencies when replayed). If the transaction was aborted, the
+ * abortion reason is propagated to userspace when attempting to commit
+ * the transaction. If the log does not contain any of these inodes, we
+ * allow the tasks to sync it.
+ */
+ if (ret && (root_log_pinned || dest_log_pinned)) {
+ if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ (new_inode &&
+ btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ if (root_log_pinned) {
+ btrfs_end_log_trans(root);
+ root_log_pinned = false;
+ }
+ if (dest_log_pinned) {
+ btrfs_end_log_trans(dest);
+ dest_log_pinned = false;
+ }
+ }
+ ret = btrfs_end_transaction(trans, root);
+out_notrans:
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&dest->fs_info->subvol_sem);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ return ret;
+}
+
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir,
+ struct dentry *dentry)
+{
+ int ret;
+ struct inode *inode;
+ u64 objectid;
+ u64 index;
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ return ret;
+
+ inode = btrfs_new_inode(trans, root, dir,
+ dentry->d_name.name,
+ dentry->d_name.len,
+ btrfs_ino(dir),
+ objectid,
+ S_IFCHR | WHITEOUT_MODE,
+ &index);
+
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ return ret;
+ }
+
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode,
+ WHITEOUT_DEV);
+
+ ret = btrfs_init_inode_security(trans, inode, dir,
+ &dentry->d_name);
+ if (ret)
+ goto out;
+
+ ret = btrfs_add_nondir(trans, dir, dentry,
+ inode, 0, index);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, inode);
+out:
+ unlock_new_inode(inode);
+ if (ret)
+ inode_dec_link_count(inode);
+ iput(inode);
+
+ return ret;
+}
+
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9712,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
u64 root_objectid;
int ret;
u64 old_ino = btrfs_ino(old_inode);
+ bool log_pinned = false;
if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9449,15 +9763,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* We want to reserve the absolute worst case amount of items. So if
* both inodes are subvols and we need to unlink them then that would
* require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
+ * would require 5 item modifications, so we'll assume they are normal
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
* should cover the worst case number of items we'll modify.
+ * If our rename has the whiteout flag, we need 5 more units for the
+ * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+ * when selinux is enabled).
*/
- trans = btrfs_start_transaction(root, 11);
+ trans_num_items = 11;
+ if (flags & RENAME_WHITEOUT)
+ trans_num_items += 5;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_notrans;
- }
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9791,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(root->fs_info, trans);
} else {
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9478,14 +9800,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_ino(new_dir), index);
if (ret)
goto out_fail;
- /*
- * this is an ugly little race, but the rename is required
- * to make sure that if we crash, the inode is either at the
- * old name or the new one. pinning the log transaction lets
- * us make sure we don't allow a log commit to come in after
- * we unlink the name but before we add the new name back in.
- */
- btrfs_pin_log_trans(root);
}
inode_inc_iversion(old_dir);
@@ -9552,12 +9866,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (log_pinned) {
struct dentry *parent = new_dentry->d_parent;
+
btrfs_log_new_name(trans, old_inode, old_dir, parent);
btrfs_end_log_trans(root);
+ log_pinned = false;
+ }
+
+ if (flags & RENAME_WHITEOUT) {
+ ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+ old_dentry);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
}
out_fail:
+ /*
+ * If we have pinned the log and an error happened, we unpin tasks
+ * trying to sync the log and force them to fallback to a transaction
+ * commit if the log currently contains any of the inodes involved in
+ * this rename operation (to ensure we do not persist a log with an
+ * inconsistent state for any of these inodes or leading to any
+ * inconsistencies when replayed). If the transaction was aborted, the
+ * abortion reason is propagated to userspace when attempting to commit
+ * the transaction. If the log does not contain any of these inodes, we
+ * allow the tasks to sync it.
+ */
+ if (ret && log_pinned) {
+ if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ (new_inode &&
+ btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ btrfs_end_log_trans(root);
+ log_pinned = false;
+ }
btrfs_end_transaction(trans, root);
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9918,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (flags & RENAME_EXCHANGE)
+ return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+
+ return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
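
Likewise, a hedged userspace sketch (not part of the patch, raw syscall again)
for the RENAME_EXCHANGE case now dispatched to btrfs_rename_exchange():

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/fs.h>       /* RENAME_EXCHANGE */

int main(void)
{
	/* Atomically swap the names "a" and "b"; both must already exist. */
	if (syscall(SYS_renameat2, AT_FDCWD, "a",
		    AT_FDCWD, "b", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}
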
static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10294,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans, root);
break;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
@@ -10184,7 +10537,7 @@ static const struct file_operations btrfs_dir_file_operations = {
.iterate = btrfs_real_readdir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_ioctl,
+ .compat_ioctl = btrfs_compat_ioctl,
#endif
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0b8ba717175b..05173563e4a6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
- if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
- iflags |= FS_COMPR_FL;
- else if (flags & BTRFS_INODE_NOCOMPRESS)
+ if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
+ else if (flags & BTRFS_INODE_COMPRESS)
+ iflags |= FS_COMPR_FL;
return iflags;
}
@@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
} else {
/*
- * Revert back under same assuptions as above
+ * Revert back under same assumptions as above
*/
if (S_ISREG(mode)) {
if (inode->i_size == 0)
@@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir,
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_root_item root_item;
+ struct btrfs_root_item *root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir,
u64 qgroup_reserved;
uuid_le new_uuid;
+ root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+ if (!root_item)
+ return -ENOMEM;
+
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret)
- return ret;
+ goto fail_free;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
- * screwed up since it assume subvolme qgroup's level to be 0.
+ * screwed up since it assumes subvolume qgroup's level to be 0.
*/
- if (btrfs_qgroup_level(objectid))
- return -ENOSPC;
+ if (btrfs_qgroup_level(objectid)) {
+ ret = -ENOSPC;
+ goto fail_free;
+ }
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir,
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
8, &qgroup_reserved, false);
if (ret)
- return ret;
+ goto fail_free;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv,
qgroup_reserved);
- return ret;
+ goto fail_free;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
@@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir,
BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
- memset(&root_item, 0, sizeof(root_item));
-
- inode_item = &root_item.inode;
+ inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
- btrfs_set_root_flags(&root_item, 0);
- btrfs_set_root_limit(&root_item, 0);
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
- btrfs_set_root_bytenr(&root_item, leaf->start);
- btrfs_set_root_generation(&root_item, trans->transid);
- btrfs_set_root_level(&root_item, 0);
- btrfs_set_root_refs(&root_item, 1);
- btrfs_set_root_used(&root_item, leaf->len);
- btrfs_set_root_last_snapshot(&root_item, 0);
+ btrfs_set_root_bytenr(root_item, leaf->start);
+ btrfs_set_root_generation(root_item, trans->transid);
+ btrfs_set_root_level(root_item, 0);
+ btrfs_set_root_refs(root_item, 1);
+ btrfs_set_root_used(root_item, leaf->len);
+ btrfs_set_root_last_snapshot(root_item, 0);
- btrfs_set_root_generation_v2(&root_item,
- btrfs_root_generation(&root_item));
+ btrfs_set_root_generation_v2(root_item,
+ btrfs_root_generation(root_item));
uuid_le_gen(&new_uuid);
- memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
- btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
- btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
- root_item.ctime = root_item.otime;
- btrfs_set_root_ctransid(&root_item, trans->transid);
- btrfs_set_root_otransid(&root_item, trans->transid);
+ memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+ root_item->ctime = root_item->otime;
+ btrfs_set_root_ctransid(root_item, trans->transid);
+ btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
leaf = NULL;
- btrfs_set_root_dirid(&root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, new_dirid);
key.objectid = objectid;
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- &root_item);
+ root_item);
if (ret)
goto fail;
@@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir,
BUG_ON(ret);
ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
- root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret)
btrfs_abort_transaction(trans, root, ret);
fail:
+ kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
@@ -629,6 +634,10 @@ fail:
d_instantiate(dentry, inode);
}
return ret;
+
+fail_free:
+ kfree(root_item);
+ return ret;
}
static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
@@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
- btrfs_wait_ordered_extents(root, -1);
+ btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -771,7 +780,7 @@ free_pending:
* a. be owner of dir, or
* b. be owner of victim, or
* c. have CAP_FOWNER capability
- * 6. If the victim is append-only or immutable we can't do antyhing with
+ * 6. If the victim is append-only or immutable we can't do anything with
* links pointing to it.
* 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
* 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
@@ -837,11 +846,9 @@ static noinline int btrfs_mksubvol(struct path *parent,
struct dentry *dentry;
int error;
- inode_lock_nested(dir, I_MUTEX_PARENT);
- // XXX: should've been
- // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
- // if (error == -EINTR)
- // return error;
+ error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (error == -EINTR)
+ return error;
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
@@ -1230,7 +1237,7 @@ again:
set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state,
@@ -2368,11 +2375,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
- inode_lock_nested(dir, I_MUTEX_PARENT);
- // XXX: should've been
- // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
- // if (err == -EINTR)
- // goto out_drop_write;
+ err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (err == -EINTR)
+ goto out_drop_write;
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -2562,7 +2567,7 @@ out_dput:
dput(dentry);
out_unlock_dir:
inode_unlock(dir);
-//out_drop_write:
+out_drop_write:
mnt_drop_write_file(file);
out:
kfree(vol_args);
@@ -2671,10 +2676,10 @@ out:
return ret;
}
-static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -2690,7 +2695,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
goto err_drop;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ /* Check for compatibility, reject unknown flags */
+ if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
+ return -EOPNOTSUPP;
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
@@ -2699,13 +2706,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
mutex_lock(&root->fs_info->volume_mutex);
- ret = btrfs_rm_device(root, vol_args->name);
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ ret = btrfs_rm_device(root, NULL, vol_args->devid);
+ } else {
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_rm_device(root, vol_args->name, 0);
+ }
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
- if (!ret)
- btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
-
+ if (!ret) {
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+ btrfs_info(root->fs_info, "device deleted: id %llu",
+ vol_args->devid);
+ else
+ btrfs_info(root->fs_info, "device deleted: %s",
+ vol_args->name);
+ }
out:
kfree(vol_args);
err_drop:
@@ -2713,6 +2730,47 @@ err_drop:
return ret;
}
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_drop_write;
+ }
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name, 0);
+ mutex_unlock(&root->fs_info->volume_mutex);
+
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+ kfree(vol_args);
+out:
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
+}
+
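
For illustration, a userspace sketch (not part of the patch) of deleting a
device by id through the new v2 ioctl. It assumes a uapi <linux/btrfs.h> that
already carries the BTRFS_DEVICE_SPEC_BY_ID flag and the devid member added by
this series; the helper name is made up.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

static int rm_dev_by_id(const char *mnt, __u64 devid)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int fd, ret;

	fd = open(mnt, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.flags = BTRFS_DEVICE_SPEC_BY_ID;	/* select by devid, not path */
	args.devid = devid;

	ret = ioctl(fd, BTRFS_IOC_RM_DEV_V2, &args);
	if (ret < 0)
		perror("BTRFS_IOC_RM_DEV_V2");
	close(fd);
	return ret;
}
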
static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
@@ -3472,13 +3530,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = vmalloc(root->nodesize);
- if (!buf)
- return ret;
+ buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ if (!buf) {
+ buf = vmalloc(root->nodesize);
+ if (!buf)
+ return ret;
+ }
path = btrfs_alloc_path();
if (!path) {
- vfree(buf);
+ kvfree(buf);
return ret;
}
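
The buffer allocation above follows a common kernel pattern (send.c below is
converted the same way); a condensed sketch with a hypothetical helper name:

#include <linux/slab.h>      /* kmalloc, kvfree */
#include <linux/vmalloc.h>   /* vmalloc */

/*
 * Try a physically contiguous kmalloc first and fall back to vmalloc
 * quietly (__GFP_NOWARN) when memory is fragmented. kvfree() releases
 * either kind of buffer, so callers never need to know which allocator
 * succeeded.
 */
static void *alloc_large_buf(size_t size)
{
	void *buf;

	buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
	if (!buf)
		buf = vmalloc(size);
	return buf;		/* free with kvfree(buf) */
}
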
@@ -3779,7 +3840,7 @@ process_slot:
out:
btrfs_free_path(path);
- vfree(buf);
+ kvfree(buf);
return ret;
}
@@ -4380,7 +4441,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
1)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
- ret = btrfs_dev_replace_start(root, p);
+ ret = btrfs_dev_replace_by_ioctl(root, p);
atomic_set(
&root->fs_info->mutually_exclusive_operation_running,
0);
@@ -4589,7 +4650,7 @@ again:
}
/*
- * mut. excl. ops lock is locked. Three possibilites:
+ * mut. excl. ops lock is locked. Three possibilities:
* (1) some other op is running
* (2) balance is running
* (3) balance is paused -- special case (think resume)
@@ -4851,8 +4912,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_std_error(root->fs_info, ret,
- "failed to update qgroup status and info\n");
+ btrfs_handle_fs_error(root->fs_info, err,
+ "failed to update qgroup status and info");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
ret = err;
@@ -5398,9 +5459,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
if (ret)
return ret;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop_write;
+ }
spin_lock(&root->fs_info->super_lock);
newflags = btrfs_super_compat_flags(super_block);
@@ -5419,7 +5486,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
btrfs_set_super_incompat_flags(super_block, newflags);
spin_unlock(&root->fs_info->super_lock);
- return btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
}
long btrfs_ioctl(struct file *file, unsigned int
@@ -5463,6 +5534,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
return btrfs_ioctl_rm_dev(file, argp);
+ case BTRFS_IOC_RM_DEV_V2:
+ return btrfs_ioctl_rm_dev_v2(file, argp);
case BTRFS_IOC_FS_INFO:
return btrfs_ioctl_fs_info(root, argp);
case BTRFS_IOC_DEV_INFO:
@@ -5494,7 +5567,7 @@ long btrfs_ioctl(struct file *file, unsigned int
ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
/*
* The transaction thread may want to do more work,
- * namely it pokes the cleaner ktread that will start
+ * namely it pokes the cleaner kthread that will start
* processing uncleaned subvols.
*/
wake_up_process(root->fs_info->transaction_kthread);
@@ -5556,3 +5629,24 @@ long btrfs_ioctl(struct file *file, unsigned int
return -ENOTTY;
}
+
+#ifdef CONFIG_COMPAT
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
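
For context, a small userspace sketch (not part of the patch): the same
request issued by a 32-bit binary reaches a 64-bit kernel as FS_IOC32_GETFLAGS
and is translated by btrfs_compat_ioctl() above before being handed to
btrfs_ioctl(). The flags word is int-sized in either case.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>   /* FS_IOC_GETFLAGS, FS_NOCOW_FL */

int main(int argc, char **argv)
{
	int fd, flags = 0;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror(argv[1]);
		return 1;
	}
	printf("flags: 0x%x%s\n", flags,
	       (flags & FS_NOCOW_FL) ? " (nocow)" : "");
	close(fd);
	return 0;
}
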
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0de7da5a610d..e96634a725c3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+ const u64 range_start, const u64 range_len)
{
- struct list_head splice, works;
+ LIST_HEAD(splice);
+ LIST_HEAD(skipped);
+ LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
int count = 0;
-
- INIT_LIST_HEAD(&splice);
- INIT_LIST_HEAD(&works);
+ const u64 range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
@@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
while (!list_empty(&splice) && nr) {
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
+
+ if (range_end <= ordered->start ||
+ ordered->start + ordered->disk_len <= range_start) {
+ list_move_tail(&ordered->root_extent_list, &skipped);
+ cond_resched_lock(&root->ordered_extent_lock);
+ continue;
+ }
+
list_move_tail(&ordered->root_extent_list,
&root->ordered_extents);
atomic_inc(&ordered->refs);
@@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
nr--;
count++;
}
+ list_splice_tail(&skipped, &root->ordered_extents);
list_splice_tail(&splice, &root->ordered_extents);
spin_unlock(&root->ordered_extent_lock);
@@ -708,11 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
return count;
}
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+ const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
int done;
+ int total_done = 0;
INIT_LIST_HEAD(&splice);
@@ -728,8 +740,10 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
- done = btrfs_wait_ordered_extents(root, nr);
+ done = btrfs_wait_ordered_extents(root, nr,
+ range_start, range_len);
btrfs_put_fs_root(root);
+ total_done += done;
spin_lock(&fs_info->ordered_root_lock);
if (nr != -1) {
@@ -740,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
mutex_unlock(&fs_info->ordered_operations_mutex);
+
+ return total_done;
}
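
The new skip test above is a plain half-open interval overlap check; a sketch
of the predicate it implements (hypothetical helper, not added by the patch):

#include <linux/types.h>

/*
 * An ordered extent covering [start, start + disk_len) is waited on only if
 * it intersects [range_start, range_start + range_len); callers that want
 * everything pass range_start = 0, range_len = (u64)-1.
 */
static bool ranges_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
{
	return a_start < b_start + b_len && b_start < a_start + a_len;
}
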
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 23c96059cef2..451507776ff5 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -58,7 +58,7 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
-#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */
#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
@@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+ const u64 range_start, const u64 range_len);
+int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+ const u64 range_start, const u64 range_len);
void btrfs_get_logged_extents(struct inode *inode,
struct list_head *logged_list,
const loff_t start,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9e119552ed32..9d4c05b14f6e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -85,7 +85,7 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
- * Refer to qgroup_shared_accouting() for details.
+ * Refer to qgroup_shared_accounting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
@@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
}
/*
* we call btrfs_free_qgroup_config() when umounting
- * filesystem and disabling quota, so we set qgroup_ulit
+ * filesystem and disabling quota, so we set qgroup_ulist
* to be null here to avoid double free.
*/
ulist_free(fs_info->qgroup_ulist);
@@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
/*
* The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
+ * then this qgroup and all of the parent qgroups get their reference and
* exclusive counts adjusted.
*
* Caller should hold fs_info->qgroup_lock.
@@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
/*
* No need to do lock, since this function will only be called in
- * btrfs_commmit_transaction().
+ * btrfs_commit_transaction().
*/
node = rb_first(&delayed_refs->dirty_extent_root);
while (node) {
@@ -1557,7 +1557,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* A: cur_old_roots < nr_old_roots (not exclusive before)
* !A: cur_old_roots == nr_old_roots (possible exclusive before)
* B: cur_new_roots < nr_new_roots (not exclusive now)
- * !B: cur_new_roots == nr_new_roots (possible exclsuive now)
+ * !B: cur_new_roots == nr_new_roots (possible exclusive now)
*
* Results:
* +: Possible sharing -> exclusive -: Possible exclusive -> sharing
@@ -1851,7 +1851,7 @@ out:
}
/*
- * Copy the acounting information between qgroups. This is necessary
+ * Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
* cause a transaction abort so we take extra care here to only error
* when a readonly fs is a reasonable outcome.
@@ -2340,7 +2340,7 @@ out:
mutex_unlock(&fs_info->qgroup_rescan_lock);
/*
- * only update status, since the previous part has alreay updated the
+ * only update status, since the previous part has already updated the
* qgroup info.
*/
trans = btrfs_start_transaction(fs_info->quota_root, 1);
@@ -2542,8 +2542,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
changeset.bytes_changed = 0;
changeset.range_changed = ulist_alloc(GFP_NOFS);
ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
- &changeset);
+ start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
trace_btrfs_qgroup_reserve_data(inode, start, len,
changeset.bytes_changed,
QGROUP_RESERVE);
@@ -2580,8 +2579,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
return -ENOMEM;
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
- &changeset);
+ start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -2672,7 +2670,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
}
/*
- * Check qgroup reserved space leaking, normally at destory inode
+ * Check qgroup reserved space leaking, normally at destroy inode
* time
*/
void btrfs_qgroup_check_reserved_leak(struct inode *inode)
@@ -2688,7 +2686,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
return;
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+ EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0b7792e02dd5..f8b6d411a034 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
* we can't merge with cached rbios, since the
* idea is that when we merge the destination
* rbio is going to run our IO for us. We can
- * steal from cached rbio's though, other functions
+ * steal from cached rbios though, other functions
* handle that.
*/
if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
@@ -2368,7 +2368,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
- /* Check scrubbing pairty and repair it */
+ /* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
parity = kmap(p);
if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
@@ -2493,7 +2493,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
/*
* Here means we got one corrupted data stripe and one
* corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckly, use the other one to repair
+ * is scrubbing parity, luckily, use the other one to repair
* the data, or we can not repair the data stripe.
*/
if (failp != rbio->scrubp)
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 298631eaee78..8428db7cd88f 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
do {
enqueued = 0;
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(fs_info,
device);
}
+ mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 08ef890deca6..0477dca154ed 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
* roots of b-trees that reference the tree block.
*
* the basic idea of this function is check backrefs of a given block
- * to find upper level blocks that refernece the block, and then check
- * bakcrefs of these upper level blocks recursively. the recursion stop
+ * to find upper level blocks that reference the block, and then check
+ * backrefs of these upper level blocks recursively. the recursion stop
* when tree root is reached or backrefs for the block is cached.
*
* NOTE: if we find backrefs for a block are cached, we know backrefs
@@ -1160,7 +1160,7 @@ out:
if (!RB_EMPTY_NODE(&upper->rb_node))
continue;
- /* Add this guy's upper edges to the list to proces */
+ /* Add this guy's upper edges to the list to process */
list_for_each_entry(edge, &upper->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
if (list_empty(&upper->upper))
@@ -2396,7 +2396,7 @@ again:
}
/*
- * we keep the old last snapshod transid in rtranid when we
+ * we keep the old last snapshot transid in rtranid when we
* created the relocation tree.
*/
last_snap = btrfs_root_rtransid(&reloc_root->root_item);
@@ -2418,7 +2418,7 @@ again:
}
out:
if (ret) {
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -2616,7 +2616,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* only one thread can access block_rsv at this point,
* so we don't need hold lock to protect block_rsv.
* we expand more reservation size here to allow enough
- * space for relocation and we will return eailer in
+ * space for relocation and we will return earlier in
* enospc case.
*/
rc->block_rsv->size = tmp + rc->extent_root->nodesize *
@@ -2814,7 +2814,7 @@ static void mark_block_processed(struct reloc_control *rc,
u64 bytenr, u32 blocksize)
{
set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
- EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_DIRTY);
}
static void __mark_block_processed(struct reloc_control *rc,
@@ -3182,7 +3182,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_start + offset == cluster->boundary[nr]) {
set_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
- EXTENT_BOUNDARY, GFP_NOFS);
+ EXTENT_BOUNDARY);
nr++;
}
@@ -4059,8 +4059,7 @@ restart:
}
btrfs_release_path(path);
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
- GFP_NOFS);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
if (trans) {
btrfs_end_transaction_throttle(trans, rc->extent_root);
@@ -4254,12 +4253,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
rc->block_group->key.objectid, rc->block_group->flags);
- ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_block_group_reservations(rc->block_group);
+ btrfs_wait_nocow_writers(rc->block_group);
+ btrfs_wait_ordered_roots(fs_info, -1,
+ rc->block_group->key.objectid,
+ rc->block_group->key.offset);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
@@ -4592,7 +4590,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
/*
* called before creating snapshot. it calculates metadata reservation
- * requried for relocating tree blocks in the snapshot
+ * required for relocating tree blocks in the snapshot
*/
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9fcd6dfc3266..f1c30861d062 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
* search_key: the key to search
* path: the path we search
* root_item: the root item of the tree we look for
- * root_key: the reak key of the tree we look for
+ * root_key: the root key of the tree we look for
*
- * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
* of the search key, just lookup the root with the highest offset for a
* given objectid.
*
@@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_std_error(tree_root->fs_info, err,
+ btrfs_handle_fs_error(tree_root->fs_info, err,
"Failed to start trans to delete "
"orphan item");
break;
@@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid);
btrfs_end_transaction(trans, tree_root);
if (err) {
- btrfs_std_error(tree_root->fs_info, err,
+ btrfs_handle_fs_error(tree_root->fs_info, err,
"Failed to delete root orphan "
"item");
break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4678f03e878e..70427ef66b04 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
* sure we read the bad mirror.
*/
ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
- EXTENT_DAMAGED, GFP_NOFS);
+ EXTENT_DAMAGED);
if (ret) {
/* set_extent_bits should give proper error */
WARN_ON(ret > 0);
@@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
end, EXTENT_DAMAGED, 0, NULL);
if (!corrected)
clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
- EXTENT_DAMAGED, GFP_NOFS);
+ EXTENT_DAMAGED);
}
out:
@@ -1044,7 +1044,7 @@ nodatasum_case:
/*
* !is_metadata and !have_csum, this means that the data
- * might not be COW'ed, that it might be modified
+ * might not be COWed, that it might be modified
* concurrently. The general strategy to work on the
* commit root does not help in the case when COW is not
* used.
@@ -1125,7 +1125,7 @@ nodatasum_case:
* the 2nd page of mirror #1 faces I/O errors, and the 2nd page
* of mirror #2 is readable but the final checksum test fails,
* then the 2nd page of mirror #3 could be tried, whether now
- * the final checksum succeedes. But this would be a rare
+ * the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
@@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
recover->bbio = bbio;
recover->map_length = mapped_length;
- BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+ BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
@@ -2127,6 +2127,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
if (bio->bi_error)
sblock->no_io_error_seen = 0;
+ bio_put(bio);
+
btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
}
@@ -2179,7 +2181,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio;
+ struct btrfs_bio *bbio = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2860,7 +2862,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nsectors = map->stripe_len / root->sectorsize;
+ nsectors = div_u64(map->stripe_len, root->sectorsize);
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
GFP_NOFS);
@@ -2980,6 +2982,7 @@ again:
extent_len);
mapped_length = extent_len;
+ bbio = NULL;
ret = btrfs_map_block(fs_info, READ, extent_logical,
&mapped_length, &bbio, 0);
if (!ret) {
@@ -3070,7 +3073,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int slot;
u64 nstripes;
struct extent_buffer *l;
- struct btrfs_key key;
u64 physical;
u64 logical;
u64 logic_end;
@@ -3079,7 +3081,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int mirror_num;
struct reada_control *reada1;
struct reada_control *reada2;
- struct btrfs_key key_start;
+ struct btrfs_key key;
struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
@@ -3158,21 +3160,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
/* FIXME it might be better to start readahead at commit root */
- key_start.objectid = logical;
- key_start.type = BTRFS_EXTENT_ITEM_KEY;
- key_start.offset = (u64)0;
+ key.objectid = logical;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)0;
key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
- reada1 = btrfs_reada_add(root, &key_start, &key_end);
+ reada1 = btrfs_reada_add(root, &key, &key_end);
- key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_start.type = BTRFS_EXTENT_CSUM_KEY;
- key_start.offset = logical;
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+ reada2 = btrfs_reada_add(csum_root, &key, &key_end);
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
@@ -3580,6 +3582,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(root, cache);
+ if (!ret && is_dev_replace) {
+ /*
+ * If we are doing a device replace wait for any tasks
+ * that started dellaloc right before we set the block
+ * group to RO mode, as they might have just allocated
+ * an extent from it or decided they could do a nocow
+ * write. And if any such tasks did that, wait for their
+ * ordered extents to complete and then commit the
+ * current transaction, so that we can later see the new
+ * extent items in the extent tree - the ordered extents
+ * create delayed data references (for cow writes) when
+ * they complete, which will be run and insert the
+ * corresponding extent items into the extent tree when
+ * we commit the transaction they used when running
+ * inode.c:btrfs_finish_ordered_io(). We later use
+ * the commit root of the extent tree to find extents
+ * to copy from the srcdev into the tgtdev, and we don't
+ * want to miss any new extents.
+ */
+ btrfs_wait_block_group_reservations(cache);
+ btrfs_wait_nocow_writers(cache);
+ ret = btrfs_wait_ordered_roots(fs_info, -1,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret > 0) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ ret = PTR_ERR(trans);
+ else
+ ret = btrfs_commit_transaction(trans,
+ root);
+ if (ret) {
+ scrub_pause_off(fs_info);
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+ }
scrub_pause_off(fs_info);
if (ret == 0) {
@@ -3600,9 +3642,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache, is_dev_replace);
@@ -3638,6 +3682,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+
if (ro_set)
btrfs_dec_block_group_ro(root, cache);
@@ -3675,9 +3724,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ret = -ENOMEM;
break;
}
-
- dev_replace->cursor_left = dev_replace->cursor_right;
- dev_replace->item_needs_writeback = 1;
skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8d358c547c59..b71dd298385c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
/*
* If we have a parent root we need to verify that the parent dir was
- * not delted and then re-created, if it was then we have no overwrite
+ * not deleted and then re-created, if it was then we have no overwrite
* and we can just unlink this entry.
*/
if (sctx->parent_root) {
@@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
return -ENOMEM;
/*
- * This hack is needed because empty acl's are stored as zero byte
+ * This hack is needed because empty acls are stored as zero byte
* data in xattrs. Problem with that is, that receiving these zero byte
- * acl's will fail later. To fix this, we send a dummy acl list that
+ * acls will fail later. To fix this, we send a dummy acl list that
* only contains the version number and no entries.
*/
if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
@@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
+ unsigned alloc_size;
int sort_clone_roots = 0;
int index;
@@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
+ if (arg->clone_sources_count >
+ ULLONG_MAX / sizeof(*arg->clone_sources)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (!access_ok(VERIFY_READ, arg->clone_sources,
sizeof(*arg->clone_sources) *
arg->clone_sources_count)) {
@@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = vmalloc(sctx->send_max_size);
+ sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
if (!sctx->send_buf) {
- ret = -ENOMEM;
- goto out;
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
- sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+ sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
+ sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+ if (!sctx->read_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
- (arg->clone_sources_count + 1));
+ alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
+
+ sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
if (!sctx->clone_roots) {
- ret = -ENOMEM;
- goto out;
+ sctx->clone_roots = vzalloc(alloc_size);
+ if (!sctx->clone_roots) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
+ alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+
if (arg->clone_sources_count) {
- clone_sources_tmp = vmalloc(arg->clone_sources_count *
- sizeof(*arg->clone_sources));
+ clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
if (!clone_sources_tmp) {
- ret = -ENOMEM;
- goto out;
+ clone_sources_tmp = vmalloc(alloc_size);
+ if (!clone_sources_tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
- arg->clone_sources_count *
- sizeof(*arg->clone_sources));
+ alloc_size);
if (ret) {
ret = -EFAULT;
goto out;
@@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
}
- vfree(clone_sources_tmp);
+ kvfree(clone_sources_tmp);
clone_sources_tmp = NULL;
}
@@ -6207,15 +6227,15 @@ out:
btrfs_root_dec_send_in_progress(sctx->parent_root);
kfree(arg);
- vfree(clone_sources_tmp);
+ kvfree(clone_sources_tmp);
if (sctx) {
if (sctx->send_filp)
fput(sctx->send_filp);
- vfree(sctx->clone_roots);
- vfree(sctx->send_buf);
- vfree(sctx->read_buf);
+ kvfree(sctx->clone_roots);
+ kvfree(sctx->send_buf);
+ kvfree(sctx->read_buf);
name_cache_free(sctx);
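
The clone_sources_count check added above is the usual guard against the size
computation wrapping before it feeds access_ok() and the allocations; as a
sketch (hypothetical helper, not part of the patch):

#include <linux/kernel.h>   /* ULLONG_MAX */
#include <linux/types.h>

/* Reject a count if count * elem_size would overflow a u64. */
static bool count_overflows(u64 count, size_t elem_size)
{
	return elem_size && count > ULLONG_MAX / elem_size;
}
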
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index e05619f241be..875c757e73e2 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p)
*
* The end result is that anyone who #includes ctree.h gets a
* declaration for the btrfs_set_foo functions and btrfs_foo functions,
- * which are wappers of btrfs_set_token_#bits functions and
+ * which are wrappers of btrfs_set_token_#bits functions and
* btrfs_get_token_#bits functions, which are defined in this file.
*
* These setget functions do all the extent_buffer related mapping
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 00b8f37cc306..60e7179ed4b7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno)
return errstr;
}
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
- /*
- * today we only save the error info into ram. Long term we'll
- * also send it down to the disk
- */
- set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
-}
-
/* btrfs handle error by forcing the filesystem readonly */
static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
{
@@ -121,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* Note that a running device replace operation is not
* canceled here although there is no way to update
* the progress. It would add the risk of a deadlock,
- * therefore the canceling is ommited. The only penalty
+ * therefore the canceling is omitted. The only penalty
* is that some I/O remains active until the procedure
* completes. The next time when the filesystem is
* mounted writeable again, the device replace
@@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
}
/*
- * __btrfs_std_error decodes expected errors from the caller and
+ * __btrfs_handle_fs_error decodes expected errors from the caller and
 * invokes the appropriate error response.
*/
__cold
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
@@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
}
#endif
+ /*
+ * Today we only save the error info to memory. Long term we'll
+ * also send it down to the disk
+ */
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
/* Don't go through full error handling during mount */
- save_error_info(fs_info);
if (sb->s_flags & MS_BORN)
btrfs_handle_error(fs_info);
}
@@ -239,7 +235,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
- if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
+ if (!trans->dirty && list_empty(&trans->new_bgs)) {
const char *errstr;
errstr = btrfs_decode_error(errno);
@@ -252,7 +248,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
/* Wake up anybody who may be waiting on this transaction */
wake_up(&root->fs_info->transaction_wait);
wake_up(&root->fs_info->transaction_blocked_wait);
- __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+ __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller,
@@ -1160,7 +1156,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1488,10 +1484,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
} else {
/*
- * Since SELinux(the only one supports security_mnt_opts) does
- * NOT support changing context during remount/mount same sb,
- * This must be the same or part of the same security options,
- * just free it.
+ * Since SELinux (the only one supporting security_mnt_opts)
+ * does NOT support changing context during remount/mount of
+ * the same sb, this must be the same or part of the same
+ * security options, just free it.
*/
security_free_mnt_opts(sec_opts);
}
@@ -1669,8 +1665,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
/*
- * We need cleanup all defragable inodes if the autodefragment is
- * close or the fs is R/O.
+ * We need to clean up all defragable inodes if the autodefrag option
+ * is turned off or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@ -1811,6 +1807,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
}
sb->s_flags &= ~MS_RDONLY;
+
+ fs_info->open = 1;
}
out:
wake_up_process(fs_info->transaction_kthread);
@@ -1881,7 +1879,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
int ret;
/*
- * We aren't under the device list lock, so this is racey-ish, but good
+ * We aren't under the device list lock, so this is racy-ish, but good
* enough for our purposes.
*/
nr_devices = fs_info->fs_devices->open_devices;
@@ -1900,7 +1898,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
if (!devices_info)
return -ENOMEM;
- /* calc min stripe number for data space alloction */
+ /* calc min stripe number for data space allocation */
type = btrfs_get_alloc_profile(root, 1);
if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
@@ -1936,7 +1934,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
avail_space *= BTRFS_STRIPE_LEN;
/*
- * In order to avoid overwritting the superblock on the drive,
+ * In order to avoid overwriting the superblock on the drive,
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
@@ -2051,9 +2049,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
u64 thresh = 0;
+ int mixed = 0;
/*
- * holding chunk_muext to avoid allocating new chunks, holding
+ * holding chunk_mutex to avoid allocating new chunks, holding
* device_list_mutex to avoid the device being removed
*/
rcu_read_lock();
@@ -2076,8 +2075,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
- if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- total_free_meta += found->disk_total - found->disk_used;
+
+ /*
+ * Metadata in mixed block group profiles is accounted in data
+ */
+ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+ mixed = 1;
+ else
+ total_free_meta += found->disk_total -
+ found->disk_used;
+ }
total_used += found->disk_used;
}
@@ -2090,7 +2098,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/* Account global block reserve as used, it's in logical size already */
spin_lock(&block_rsv->lock);
- buf->f_bfree -= block_rsv->size >> bits;
+ /* Mixed block groups accounting is not byte-accurate, avoid overflow */
+ if (buf->f_bfree >= block_rsv->size >> bits)
+ buf->f_bfree -= block_rsv->size >> bits;
+ else
+ buf->f_bfree = 0;
spin_unlock(&block_rsv->lock);
buf->f_bavail = div_u64(total_free_data, factor);
@@ -2115,7 +2127,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
thresh = 4 * 1024 * 1024;
- if (total_free_meta - thresh < block_rsv->size)
+ if (!mixed && total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
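
The f_bfree adjustment above is a saturating subtraction; with mixed block
groups the global reserve can exceed the computed free count, so the value is
clamped at zero instead of wrapping a u64. A sketch (hypothetical helper):

#include <linux/types.h>

static u64 sub_saturating(u64 a, u64 b)
{
	return a > b ? a - b : 0;
}
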
@@ -2293,7 +2305,7 @@ static void btrfs_interface_exit(void)
static void btrfs_print_mod_info(void)
{
- printk(KERN_INFO "Btrfs loaded"
+ printk(KERN_INFO "Btrfs loaded, crc32c=%s"
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
@@ -2303,33 +2315,48 @@ static void btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
", integrity-checker=on"
#endif
- "\n");
+ "\n",
+ btrfs_crc32c_impl());
}
static int btrfs_run_sanity_tests(void)
{
- int ret;
-
+ int ret, i;
+ u32 sectorsize, nodesize;
+ u32 test_sectorsize[] = {
+ PAGE_SIZE,
+ };
ret = btrfs_init_test_fs();
if (ret)
return ret;
-
- ret = btrfs_test_free_space_cache();
- if (ret)
- goto out;
- ret = btrfs_test_extent_buffer_operations();
- if (ret)
- goto out;
- ret = btrfs_test_extent_io();
- if (ret)
- goto out;
- ret = btrfs_test_inodes();
- if (ret)
- goto out;
- ret = btrfs_test_qgroups();
- if (ret)
- goto out;
- ret = btrfs_test_free_space_tree();
+ for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
+ sectorsize = test_sectorsize[i];
+ for (nodesize = sectorsize;
+ nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
+ nodesize <<= 1) {
+ pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n",
+ sectorsize, nodesize);
+ ret = btrfs_test_free_space_cache(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations(sectorsize,
+ nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ }
+ }
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 539e7b5e3f86..4879656bda3c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
if (!fs_info)
return -EPERM;
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
ret = kstrtoul(skip_spaces(buf), 0, &val);
if (ret)
return ret;
@@ -364,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
char *label = fs_info->super_copy->label;
- return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ssize_t ret;
+
+ spin_lock(&fs_info->super_lock);
+ ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
}
static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,6 +383,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
size_t p_len;
+ if (!fs_info)
+ return -EPERM;
+
if (fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index f54bf450bad3..10eb249ef891 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -175,7 +175,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
}
struct btrfs_block_group_cache *
-btrfs_alloc_dummy_block_group(unsigned long length)
+btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize)
{
struct btrfs_block_group_cache *cache;
@@ -192,8 +192,8 @@ btrfs_alloc_dummy_block_group(unsigned long length)
cache->key.objectid = 0;
cache->key.offset = length;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = 4096;
- cache->full_stripe_len = 4096;
+ cache->sectorsize = sectorsize;
+ cache->full_stripe_len = sectorsize;
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 054b8c73c951..66fb6b701eb7 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -26,27 +26,28 @@
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_test_free_space_cache(void);
-int btrfs_test_extent_buffer_operations(void);
-int btrfs_test_extent_io(void);
-int btrfs_test_inodes(void);
-int btrfs_test_qgroups(void);
-int btrfs_test_free_space_tree(void);
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
+int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
+int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group_cache *
-btrfs_alloc_dummy_block_group(unsigned long length);
+btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize);
void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
-static inline int btrfs_test_free_space_cache(void)
+static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
return 0;
}
-static inline int btrfs_test_extent_buffer_operations(void)
+static inline int btrfs_test_extent_buffer_operations(u32 sectorsize,
+ u32 nodesize)
{
return 0;
}
@@ -57,19 +58,19 @@ static inline int btrfs_init_test_fs(void)
static inline void btrfs_destroy_test_fs(void)
{
}
-static inline int btrfs_test_extent_io(void)
+static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
return 0;
}
-static inline int btrfs_test_inodes(void)
+static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
{
return 0;
}
-static inline int btrfs_test_qgroups(void)
+static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
{
return 0;
}
-static inline int btrfs_test_free_space_tree(void)
+static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
{
return 0;
}
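When CONFIG_BTRFS_FS_RUN_SANITY_TESTS is disabled, the inline stubs keep the same (sectorsize, nodesize) prototype and return 0, so a caller can pass the values unconditionally. A rough sketch of such a call site, assuming PAGE_SIZE-derived defaults (the real caller is not part of this hunk):

	/* Hypothetical caller sketch -- not taken from this patch. */
	u32 sectorsize = PAGE_SIZE;	/* assumed: run the tests at the page size */
	u32 nodesize = PAGE_SIZE;	/* assumed: smallest valid nodesize */
	int ret;

	ret = btrfs_test_free_space_cache(sectorsize, nodesize);
	if (!ret)
		ret = btrfs_test_extent_io(sectorsize, nodesize);
	/* With the sanity tests compiled out, both stubs simply return 0. */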
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index f51963a8f929..4f8cbd1ec5ee 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -22,7 +22,7 @@
#include "../extent_io.h"
#include "../disk-io.h"
-static int test_btrfs_split_item(void)
+static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
{
struct btrfs_path *path;
struct btrfs_root *root;
@@ -40,7 +40,7 @@ static int test_btrfs_split_item(void)
test_msg("Running btrfs_split_item tests\n");
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Could not allocate root\n");
return PTR_ERR(root);
@@ -53,7 +53,8 @@ static int test_btrfs_split_item(void)
return -ENOMEM;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize,
+ nodesize);
if (!eb) {
test_msg("Could not allocate dummy buffer\n");
ret = -ENOMEM;
@@ -222,8 +223,8 @@ out:
return ret;
}
-int btrfs_test_extent_buffer_operations(void)
+int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize)
{
- test_msg("Running extent buffer operation tests");
- return test_btrfs_split_item();
+ test_msg("Running extent buffer operation tests\n");
+ return test_btrfs_split_item(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 70948b13bc81..d19ab0317283 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -21,6 +21,7 @@
#include <linux/slab.h>
#include <linux/sizes.h>
#include "btrfs-tests.h"
+#include "../ctree.h"
#include "../extent_io.h"
#define PROCESS_UNLOCK (1 << 0)
@@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
return count;
}
-static int test_find_delalloc(void)
+static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
struct extent_io_tree tmp;
@@ -113,7 +114,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
+ set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL);
start = 0;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -122,9 +123,9 @@ static int test_find_delalloc(void)
test_msg("Should have found at least one delalloc\n");
goto out_bits;
}
- if (start != 0 || end != 4095) {
- test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
- start, end);
+ if (start != 0 || end != (sectorsize - 1)) {
+ test_msg("Expected start 0 end %u, got start %llu end %llu\n",
+ sectorsize - 1, start, end);
goto out_bits;
}
unlock_extent(&tmp, start, end);
@@ -144,7 +145,7 @@ static int test_find_delalloc(void)
test_msg("Couldn't find the locked page\n");
goto out_bits;
}
- set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
+ set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -172,11 +173,11 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- test_start = max_bytes + 4096;
+ test_start = max_bytes + sectorsize;
locked_page = find_lock_page(inode->i_mapping, test_start >>
PAGE_SHIFT);
if (!locked_page) {
- test_msg("Could'nt find the locked page\n");
+ test_msg("Couldn't find the locked page\n");
goto out_bits;
}
start = test_start;
@@ -199,7 +200,7 @@ static int test_find_delalloc(void)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -262,7 +263,7 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
put_page(locked_page);
@@ -272,6 +273,16 @@ out:
return ret;
}
+/**
+ * test_bit_in_byte - Determine whether a bit is set in a byte
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit_in_byte(int nr, const u8 *addr)
+{
+ return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1)));
+}
+
static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
unsigned long len)
{
@@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
return -EINVAL;
}
- bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
- sizeof(long) * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
- sizeof(long) * BITS_PER_BYTE);
- if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_msg("Setting straddling pages failed\n");
- return -EINVAL;
- }
+ /* Straddling pages test */
+ if (len > PAGE_SIZE) {
+ bitmap_set(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting straddling pages failed\n");
+ return -EINVAL;
+ }
- bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
- bitmap_clear(bitmap,
- (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
- sizeof(long) * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
- extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
- sizeof(long) * BITS_PER_BYTE);
- if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_msg("Clearing straddling pages failed\n");
- return -EINVAL;
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ bitmap_clear(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing straddling pages failed\n");
+ return -EINVAL;
+ }
}
/*
@@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
for (i = 0; i < len * BITS_PER_BYTE; i++) {
int bit, bit1;
- bit = !!test_bit(i, bitmap);
+ bit = !!test_bit_in_byte(i, (u8 *)bitmap);
bit1 = !!extent_buffer_test_bit(eb, 0, i);
if (bit1 != bit) {
test_msg("Testing bit pattern failed\n");
@@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
return 0;
}
-static int test_eb_bitmaps(void)
+static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
- unsigned long len = PAGE_SIZE * 4;
+ unsigned long len;
unsigned long *bitmap;
struct extent_buffer *eb;
int ret;
test_msg("Running extent buffer bitmap tests\n");
+ /*
+ * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
+ * BTRFS_MAX_METADATA_BLOCKSIZE.
+ */
+ len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE)
+ ? sectorsize * 4 : sectorsize;
+
bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
test_msg("Couldn't allocate test bitmap\n");
@@ -379,7 +401,7 @@ static int test_eb_bitmaps(void)
/* Do it over again with an extent buffer which isn't page-aligned. */
free_extent_buffer(eb);
- eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len);
+ eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
if (!eb) {
test_msg("Couldn't allocate test extent buffer\n");
kfree(bitmap);
@@ -393,17 +415,17 @@ out:
return ret;
}
-int btrfs_test_extent_io(void)
+int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
test_msg("Running extent I/O tests\n");
- ret = test_find_delalloc();
+ ret = test_find_delalloc(sectorsize);
if (ret)
goto out;
- ret = test_eb_bitmaps();
+ ret = test_eb_bitmaps(sectorsize, nodesize);
out:
test_msg("Extent I/O tests finished\n");
return ret;
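test_bit_in_byte() indexes the bitmap one byte at a time, so the expected bit pattern no longer depends on the host's long size or byte order the way test_bit() does when it reads native-endian longs. A small worked example of the indexing (illustrative values only):

	/* Bit 11 lives in byte 11 / 8 == 1, at position 11 % 8 == 3. */
	u8 buf[2] = { 0x00, 0x08 };	/* 0x08 == 1 << 3 */

	/* test_bit_in_byte(11, buf) == 1, test_bit_in_byte(10, buf) == 0 */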
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 514247515312..3956bb2ff84c 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -22,10 +22,10 @@
#include "../disk-io.h"
#include "../free-space-cache.h"
-#define BITS_PER_BITMAP (PAGE_SIZE * 8)
+#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
/*
- * This test just does basic sanity checking, making sure we can add an exten
+ * This test just does basic sanity checking, making sure we can add an extent
* entry and remove space from either end and the middle, and make sure we can
* remove space that covers adjacent extent entries.
*/
@@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache)
return 0;
}
-static int test_bitmaps(struct btrfs_block_group_cache *cache)
+static int test_bitmaps(struct btrfs_block_group_cache *cache,
+ u32 sectorsize)
{
u64 next_bitmap_offset;
int ret;
@@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
* The first bitmap we have starts at offset 0 so the next one is just
* at the end of the first bitmap.
*/
- next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+ next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
/* Test a bit straddling two bitmaps */
ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
@@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
}
/* This is the high grade jackassery */
-static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
+ u32 sectorsize)
{
- u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+ u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
int ret;
test_msg("Running bitmap and extent tests\n");
@@ -396,11 +398,13 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
* wasn't optimal as they could be spread all over the block group while under
* concurrency (extra overhead and fragmentation).
*
- * This stealing approach is benefical, since we always prefer to allocate from
- * extent entries, both for clustered and non-clustered allocation requests.
+ * This stealing approach is beneficial, since we always prefer to allocate
+ * from extent entries, both for clustered and non-clustered allocation
+ * requests.
*/
static int
-test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
+ u32 sectorsize)
{
int ret;
u64 offset;
@@ -538,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -596,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
- test_msg("Cache free space is not 1Mb + 4Kb\n");
+ if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) {
+ test_msg("Cache free space is not 1Mb + %u\n", sectorsize);
return -EINVAL;
}
@@ -610,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -EINVAL;
}
- /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+ /*
+ * All that remains is a sectorsize free space region in a bitmap.
+ * Confirm.
+ */
ret = check_num_extents_and_bitmaps(cache, 1, 1);
if (ret)
return ret;
- if (cache->free_space_ctl->free_space != 4096) {
- test_msg("Cache free space is not 4Kb\n");
+ if (cache->free_space_ctl->free_space != sectorsize) {
+ test_msg("Cache free space is not %u\n", sectorsize);
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 4096, 0,
+ 0, sectorsize, 0,
&max_extent_size);
if (offset != (SZ_128M + SZ_16M)) {
- test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
- offset);
+ test_msg("Failed to allocate %u, returned offset : %llu\n",
+ sectorsize, offset);
return -EINVAL;
}
@@ -732,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, SZ_32M, 8192);
+ ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -756,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
/*
* Confirm that our extent entry didn't stole all free space from the
- * bitmap, because of the small 8Kb free space region.
+ * bitmap, because of the small 2 * sectorsize free space region.
*/
ret = check_num_extents_and_bitmaps(cache, 2, 1);
if (ret)
@@ -782,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
- test_msg("Cache free space is not 1Mb + 8Kb\n");
+ if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) {
+ test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize);
return -EINVAL;
}
@@ -795,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -EINVAL;
}
- /* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+ /*
+ * All that remains is 2 * sectorsize free space region
+ * in a bitmap. Confirm.
+ */
ret = check_num_extents_and_bitmaps(cache, 1, 1);
if (ret)
return ret;
- if (cache->free_space_ctl->free_space != 8192) {
- test_msg("Cache free space is not 8Kb\n");
+ if (cache->free_space_ctl->free_space != 2 * sectorsize) {
+ test_msg("Cache free space is not %u\n", 2 * sectorsize);
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 8192, 0,
+ 0, 2 * sectorsize, 0,
&max_extent_size);
if (offset != SZ_32M) {
- test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+ test_msg("Failed to allocate %u, offset: %llu\n",
+ 2 * sectorsize,
offset);
return -EINVAL;
}
@@ -824,7 +835,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return 0;
}
-int btrfs_test_free_space_cache(void)
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_block_group_cache *cache;
struct btrfs_root *root = NULL;
@@ -832,13 +843,19 @@ int btrfs_test_free_space_cache(void)
test_msg("Running btrfs free space cache tests\n");
- cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
+ /*
+ * For ppc64 (with 64k page size), bytes per bitmap might be
+ * larger than 1G. To make bitmap test available in ppc64,
+ * alloc dummy block group whose size cross bitmaps.
+ */
+ cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize
+ + PAGE_SIZE, sectorsize);
if (!cache) {
test_msg("Couldn't run the tests\n");
return 0;
}
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
@@ -854,14 +871,14 @@ int btrfs_test_free_space_cache(void)
ret = test_extents(cache);
if (ret)
goto out;
- ret = test_bitmaps(cache);
+ ret = test_bitmaps(cache, sectorsize);
if (ret)
goto out;
- ret = test_bitmaps_and_extents(cache);
+ ret = test_bitmaps_and_extents(cache, sectorsize);
if (ret)
goto out;
- ret = test_steal_space_from_bitmap_to_extent(cache);
+ ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
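The dummy block group is now sized from BITS_PER_BITMAP rather than a fixed 1G because one free-space bitmap covers PAGE_SIZE * 8 bits, one bit per sector, and the cross-bitmap cases only trigger when the group spans more than that. Rough arithmetic, not part of the patch:

	/* Bytes of disk space covered by a single free-space bitmap. */
	u64 bytes_per_bitmap = (u64)BITS_PER_BITMAP * sectorsize;
	/* 4K page, 4K sector:   32768 bits * 4K  = 128M -> a 1G group spans 8 bitmaps   */
	/* 64K page, 64K sector: 524288 bits * 64K = 32G -> a 1G group never crosses one */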
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 7cea4462acd5..aac507085ab0 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../disk-io.h"
@@ -30,7 +31,7 @@ struct free_space_extent {
* The test cases align their operations to this in order to hit some of the
* edge cases in the bitmap code.
*/
-#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE)
static int __check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -439,7 +440,8 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *,
struct btrfs_block_group_cache *,
struct btrfs_path *);
-static int run_test(test_func_t test_func, int bitmaps)
+static int run_test(test_func_t test_func, int bitmaps,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_root *root = NULL;
struct btrfs_block_group_cache *cache = NULL;
@@ -447,7 +449,7 @@ static int run_test(test_func_t test_func, int bitmaps)
struct btrfs_path *path = NULL;
int ret;
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Couldn't allocate dummy root\n");
ret = PTR_ERR(root);
@@ -466,7 +468,8 @@ static int run_test(test_func_t test_func, int bitmaps)
root->fs_info->free_space_root = root;
root->fs_info->tree_root = root;
- root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info,
+ nodesize, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -474,9 +477,9 @@ static int run_test(test_func_t test_func, int bitmaps)
}
btrfs_set_header_level(root->node, 0);
btrfs_set_header_nritems(root->node, 0);
- root->alloc_bytenr += 8192;
+ root->alloc_bytenr += 2 * nodesize;
- cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+ cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize);
if (!cache) {
test_msg("Couldn't allocate dummy block group cache\n");
ret = -ENOMEM;
@@ -534,17 +537,18 @@ out:
return ret;
}
-static int run_test_both_formats(test_func_t test_func)
+static int run_test_both_formats(test_func_t test_func,
+ u32 sectorsize, u32 nodesize)
{
int ret;
- ret = run_test(test_func, 0);
+ ret = run_test(test_func, 0, sectorsize, nodesize);
if (ret)
return ret;
- return run_test(test_func, 1);
+ return run_test(test_func, 1, sectorsize, nodesize);
}
-int btrfs_test_free_space_tree(void)
+int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
{
test_func_t tests[] = {
test_empty_block_group,
@@ -561,9 +565,11 @@ int btrfs_test_free_space_tree(void)
test_msg("Running free space tree tests\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
- int ret = run_test_both_formats(tests[i]);
+ int ret = run_test_both_formats(tests[i], sectorsize,
+ nodesize);
if (ret) {
- test_msg("%pf failed\n", tests[i]);
+ test_msg("%pf : sectorsize %u failed\n",
+ tests[i], sectorsize);
return ret;
}
}
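Each entry in tests[] still runs twice via run_test_both_formats(), once with extent-based accounting and once with bitmaps, only now with the sector and node sizes threaded through. Wiring in an additional case would look roughly like this (test_my_new_case is a hypothetical name used only for illustration):

	test_func_t tests[] = {
		test_empty_block_group,
		/* ... existing entries ... */
		test_my_new_case,	/* hypothetical new test */
	};

	/* run_test_both_formats(tests[i], sectorsize, nodesize) then covers
	 * both the extent (bitmaps == 0) and bitmap (bitmaps == 1) formats. */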
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 863a6a3af1f8..29648c0a39f1 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../btrfs_inode.h"
@@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root)
* diagram of how the extents will look though this may not be possible we still
* want to make sure everything acts normally (the last number is not inclusive)
*
- * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288]
- * [hole ][inline][ hole ][ regular ][regular1 split][ hole ]
+ * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
+ * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split]
*
- * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056]
- * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ]
+ * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
+ * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
*
- * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ]
- * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent]
+ * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635]
+ * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1]
*
- * [81920-86016]
- * [ regular ]
+ * [69635-73731][ 73731 - 86019 ][86019-90115]
+ * [ regular ][ hole but no extent][ regular ]
*/
-static void setup_file_extents(struct btrfs_root *root)
+static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
{
int slot = 0;
u64 disk_bytenr = SZ_1M;
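With a 4K sector, the byte ranges in the updated diagram fall straight out of the offset arithmetic below; for example, the 12291 boundary at the end of the first split-extent piece works out as (worked example, sectorsize == 4096):

	u64 off;

	off = sectorsize;	/* after the inline extent, offset jumps to sectorsize: 4096 */
	off += 4;		/* the 4-byte hole extent                            -> 4100  */
	off += sectorsize - 1;	/* the first regular extent                          -> 8195  */
	off += sectorsize;	/* the first piece of the split extent               -> 12291 */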
@@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root)
insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
slot);
slot++;
- offset = 4096;
+ offset = sectorsize;
/* Now another hole */
insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
@@ -128,99 +129,106 @@ static void setup_file_extents(struct btrfs_root *root)
offset += 4;
/* Now for a regular extent */
- insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- disk_bytenr += 4096;
- offset += 4095;
+ disk_bytenr += sectorsize;
+ offset += sectorsize - 1;
/*
* Now for 3 extents that were split from a hole punch so we test
* offsets properly.
*/
- insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
- 0, slot);
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, 4 * sectorsize,
BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 8192;
- disk_bytenr += 16384;
+ offset += 2 * sectorsize;
+ disk_bytenr += 4 * sectorsize;
/* Now for a unwritten prealloc extent */
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
- offset += 4096;
+ offset += sectorsize;
/*
* We want to jack up disk_bytenr a little more so the em stuff doesn't
* merge our records.
*/
- disk_bytenr += 8192;
+ disk_bytenr += 2 * sectorsize;
/*
* Now for a partially written prealloc extent, basically the same as
* the hole punch example above. Ram_bytes never changes when you mark
* extents written btw.
*/
- insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
- BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize,
+ disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, 4 * sectorsize,
BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
- offset += 8192;
- disk_bytenr += 16384;
+ offset += 2 * sectorsize;
+ disk_bytenr += 4 * sectorsize;
/* Now a normal compressed extent */
- insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG,
+ BTRFS_COMPRESS_ZLIB, slot);
slot++;
- offset += 8192;
+ offset += 2 * sectorsize;
/* No merges */
- disk_bytenr += 8192;
+ disk_bytenr += 2 * sectorsize;
/* Now a split compressed extent */
- insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG,
+ BTRFS_COMPRESS_ZLIB, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0,
+ disk_bytenr + sectorsize, sectorsize,
BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, sectorsize,
BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
slot++;
- offset += 8192;
- disk_bytenr += 8192;
+ offset += 2 * sectorsize;
+ disk_bytenr += 2 * sectorsize;
/* Now extents that have a hole but no hole extent */
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 16384;
- disk_bytenr += 4096;
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ offset += 4 * sectorsize;
+ disk_bytenr += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
static unsigned long prealloc_only = 0;
static unsigned long compressed_only = 0;
static unsigned long vacancy_only = 0;
-static noinline int test_btrfs_get_extent(void)
+static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
@@ -240,7 +248,7 @@ static noinline int test_btrfs_get_extent(void)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
goto out;
@@ -256,7 +264,7 @@ static noinline int test_btrfs_get_extent(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(NULL, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -264,7 +272,7 @@ static noinline int test_btrfs_get_extent(void)
/*
* We will just free a dummy node if it's ref count is 2 so we need an
- * extra ref so our searches don't accidently release our page.
+ * extra ref so our searches don't accidentally release our page.
*/
extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
@@ -273,7 +281,7 @@ static noinline int test_btrfs_get_extent(void)
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0);
if (IS_ERR(em)) {
em = NULL;
test_msg("Got an error when we shouldn't have\n");
@@ -295,7 +303,7 @@ static noinline int test_btrfs_get_extent(void)
* setup_file_extents, so if you change anything there you need to
* update the comment and update the expected values below.
*/
- setup_file_extents(root);
+ setup_file_extents(root, sectorsize);
em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
if (IS_ERR(em)) {
@@ -318,7 +326,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -327,7 +335,8 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected an inline, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4091) {
+
+ if (em->start != offset || em->len != (sectorsize - 5)) {
test_msg("Unexpected extent wanted start %llu len 1, got start "
"%llu len %llu\n", offset, em->start, em->len);
goto out;
@@ -344,7 +353,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -366,7 +375,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -375,7 +384,7 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4095) {
+ if (em->start != offset || em->len != sectorsize - 1) {
test_msg("Unexpected extent wanted start %llu len 4095, got "
"start %llu len %llu\n", offset, em->start, em->len);
goto out;
@@ -393,7 +402,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -402,9 +411,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -421,7 +431,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -430,9 +440,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a hole, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -442,7 +453,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -451,9 +462,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -475,7 +487,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -484,9 +496,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
@@ -503,7 +516,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -512,9 +525,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
@@ -532,7 +546,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -541,9 +555,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -564,7 +579,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -573,9 +588,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
@@ -598,7 +614,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -607,9 +623,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u,"
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
@@ -631,7 +648,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -640,9 +657,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u,"
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
@@ -665,7 +683,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -674,9 +692,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -691,7 +710,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -701,9 +720,10 @@ static noinline int test_btrfs_get_extent(void)
disk_bytenr, em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
@@ -725,7 +745,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -734,9 +754,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -765,9 +786,10 @@ static noinline int test_btrfs_get_extent(void)
* length of the actual hole, if this changes we'll have to change this
* test.
*/
- if (em->start != offset || em->len != 12288) {
- test_msg("Unexpected extent wanted start %llu len 12288, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 3 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 3 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
@@ -783,7 +805,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -792,9 +814,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u,"
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -815,7 +838,7 @@ out:
return ret;
}
-static int test_hole_first(void)
+static int test_hole_first(u32 sectorsize, u32 nodesize)
{
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
@@ -832,7 +855,7 @@ static int test_hole_first(void)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
goto out;
@@ -844,7 +867,7 @@ static int test_hole_first(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(NULL, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -861,9 +884,9 @@ static int test_hole_first(void)
* btrfs_get_extent.
*/
insert_inode_item_key(root);
- insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
- BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+ insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
+ em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -872,9 +895,10 @@ static int test_hole_first(void)
test_msg("Expected a hole, got %llu\n", em->block_start);
goto out;
}
- if (em->start != 0 || em->len != 4096) {
- test_msg("Unexpected extent wanted start 0 len 4096, got start "
- "%llu len %llu\n", em->start, em->len);
+ if (em->start != 0 || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start 0 len %u, "
+ "got start %llu len %llu\n",
+ sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
@@ -884,18 +908,19 @@ static int test_hole_first(void)
}
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+ em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
}
- if (em->block_start != 4096) {
+ if (em->block_start != sectorsize) {
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != 4096 || em->len != 4096) {
- test_msg("Unexpected extent wanted start 4096 len 4096, got "
- "start %llu len %llu\n", em->start, em->len);
+ if (em->start != sectorsize || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %u len %u, "
+ "got start %llu len %llu\n",
+ sectorsize, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -912,7 +937,7 @@ out:
return ret;
}
-static int test_extent_accounting(void)
+static int test_extent_accounting(u32 sectorsize, u32 nodesize)
{
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
@@ -924,7 +949,7 @@ static int test_extent_accounting(void)
return ret;
}
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
goto out;
@@ -954,10 +979,11 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE][4k] */
+ /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
- BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+ BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
+ NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -969,10 +995,10 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+ /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DIRTY |
EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
NULL, GFP_KERNEL);
@@ -987,10 +1013,11 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE][4K] */
+ /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ (BTRFS_MAX_EXTENT_SIZE >> 1)
+ + sectorsize - 1,
NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
@@ -1004,16 +1031,17 @@ static int test_extent_accounting(void)
}
/*
- * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+ * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*
* I'm artificially adding 2 to outstanding_extents because in the
* buffered IO case we'd add things up as we go, but I don't feel like
* doing that here, this isn't the interesting case we want to test.
*/
BTRFS_I(inode)->outstanding_extents += 2;
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
- (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
- NULL);
+ ret = btrfs_set_extent_delalloc(inode,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
+ (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
+ NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1025,10 +1053,13 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
+ */
BTRFS_I(inode)->outstanding_extents++;
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
- BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ ret = btrfs_set_extent_delalloc(inode,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1042,8 +1073,8 @@ static int test_extent_accounting(void)
/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE+4096,
- BTRFS_MAX_EXTENT_SIZE+8191,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
NULL, GFP_KERNEL);
@@ -1063,8 +1094,9 @@ static int test_extent_accounting(void)
* might fail and I'd rather satisfy my paranoia at this point.
*/
BTRFS_I(inode)->outstanding_extents++;
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
- BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ ret = btrfs_set_extent_delalloc(inode,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1103,7 +1135,7 @@ out:
return ret;
}
-int btrfs_test_inodes(void)
+int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
{
int ret;
@@ -1112,13 +1144,13 @@ int btrfs_test_inodes(void)
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
test_msg("Running btrfs_get_extent tests\n");
- ret = test_btrfs_get_extent();
+ ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
return ret;
test_msg("Running hole first btrfs_get_extent test\n");
- ret = test_hole_first();
+ ret = test_hole_first(sectorsize, nodesize);
if (ret)
return ret;
test_msg("Running outstanding_extents tests\n");
- return test_extent_accounting();
+ return test_extent_accounting(sectorsize, nodesize);
}
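The outstanding-extents checks scale the same way: a contiguous delalloc range is charged one outstanding extent per BTRFS_MAX_EXTENT_SIZE, so the [BTRFS_MAX_EXTENT_SIZE][sectorsize] layouts above are expected to pin two extents whether the sector is 4K or 64K. Rough arithmetic sketch, assuming the then-current 128M BTRFS_MAX_EXTENT_SIZE (stated only for illustration):

	u64 range_len = BTRFS_MAX_EXTENT_SIZE + sectorsize;	/* 128M + 4K, or 128M + 64K */
	/* ceil(range_len / BTRFS_MAX_EXTENT_SIZE) == 2 either way, so the expected
	 * outstanding_extents counts in these checks are independent of sectorsize. */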
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 8ea5d34bc5a2..57a12c0d680b 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../transaction.h"
@@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return ret;
}
-static int test_no_shared_qgroup(struct btrfs_root *root)
+static int test_no_shared_qgroup(struct btrfs_root *root,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -227,29 +229,30 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
btrfs_init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
- ret = btrfs_create_qgroup(NULL, fs_info, 5);
+ ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
}
/*
- * Since the test trans doesn't havee the complicated delayed refs,
+ * Since the test trans doesn't have the complicated delayed refs,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FS_TREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
old_roots = NULL;
new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = remove_extent_item(root, 4096, 4096);
+ ret = remove_extent_item(root, nodesize, nodesize);
if (ret)
return -EINVAL;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return -EINVAL;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
@@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
* right, also remove one of the roots and make sure the exclusive count is
* adjusted properly.
*/
-static int test_multiple_refs(struct btrfs_root *root)
+static int test_multiple_refs(struct btrfs_root *root,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root)
test_msg("Qgroup multiple refs test\n");
- /* We have 5 created already from the previous test */
- ret = btrfs_create_qgroup(NULL, fs_info, 256);
+ /*
+ * We have BTRFS_FS_TREE_OBJECTID created already from the
+ * previous test.
+ */
+ ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FS_TREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ ret = add_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FIRST_FREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
+ nodesize, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ ret = remove_extent_ref(root, nodesize, nodesize, 0,
+ BTRFS_FIRST_FREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
+ 0, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
@@ -435,13 +451,13 @@ static int test_multiple_refs(struct btrfs_root *root)
return 0;
}
-int btrfs_test_qgroups(void)
+int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
{
struct btrfs_root *root;
struct btrfs_root *tmp_root;
int ret = 0;
- root = btrfs_alloc_dummy_root();
+ root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
return PTR_ERR(root);
@@ -468,7 +484,8 @@ int btrfs_test_qgroups(void)
* Can't use bytenr 0, some things freak out
* *cough*backref walking code*cough*
*/
- root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize,
+ nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -476,16 +493,16 @@ int btrfs_test_qgroups(void)
}
btrfs_set_header_level(root->node, 0);
btrfs_set_header_nritems(root->node, 0);
- root->alloc_bytenr += 8192;
+ root->alloc_bytenr += 2 * nodesize;
- tmp_root = btrfs_alloc_dummy_root();
+ tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
goto out;
}
- tmp_root->root_key.objectid = 5;
+ tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID;
root->fs_info->fs_root = tmp_root;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
@@ -493,14 +510,14 @@ int btrfs_test_qgroups(void)
goto out;
}
- tmp_root = btrfs_alloc_dummy_root();
+ tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
goto out;
}
- tmp_root->root_key.objectid = 256;
+ tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
test_msg("Couldn't insert fs root %d\n", ret);
@@ -508,10 +525,10 @@ int btrfs_test_qgroups(void)
}
test_msg("Running qgroup tests\n");
- ret = test_no_shared_qgroup(root);
+ ret = test_no_shared_qgroup(root, sectorsize, nodesize);
if (ret)
goto out;
- ret = test_multiple_refs(root);
+ ret = test_multiple_refs(root, sectorsize, nodesize);
out:
btrfs_free_dummy_root(root);
return ret;
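
The qgroup test entry points above are now parameterized by sectorsize and nodesize instead of hard-coding 4096, and the magic root ids 5 and 256 are spelled out as BTRFS_FS_TREE_OBJECTID and BTRFS_FIRST_FREE_OBJECTID. A minimal sketch of how a caller could sweep these tests across several block size combinations follows; the size list, loop bounds and helper name are assumptions made for illustration, not part of this patch.

static int example_run_qgroup_tests(void)
{
	/* Hypothetical size list; real callers pick sizes the arch supports. */
	static const u32 sectorsizes[] = { 4096, 8192, 16384, 65536 };
	u32 nodesize;
	int i, ret;

	for (i = 0; i < ARRAY_SIZE(sectorsizes); i++) {
		/* nodesize is assumed to be >= sectorsize, doubling up to 64K */
		for (nodesize = sectorsizes[i]; nodesize <= 65536; nodesize *= 2) {
			ret = btrfs_test_qgroups(sectorsizes[i], nodesize);
			if (ret)
				return ret;
		}
	}
	return 0;
}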
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 43885e51b882..765845742fde 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -311,10 +311,11 @@ loop:
* when the transaction commits
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ int force)
{
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
- root->last_trans < trans->transid) {
+ if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) || force) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
@@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
smp_wmb();
spin_lock(&root->fs_info->fs_roots_radix_lock);
- if (root->last_trans == trans->transid) {
+ if (root->last_trans == trans->transid && !force) {
spin_unlock(&root->fs_info->fs_roots_radix_lock);
return 0;
}
@@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&root->fs_info->reloc_mutex);
- record_root_in_trans(trans, root);
+ record_root_in_trans(trans, root, 0);
mutex_unlock(&root->fs_info->reloc_mutex);
return 0;
@@ -943,7 +944,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
err = convert_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT,
- mark, &cached_state, GFP_NOFS);
+ mark, &cached_state);
/*
* convert_extent_bit can return -ENOMEM, which is most of the
* time a temporary error. So when it happens, ignore the error
@@ -1311,6 +1312,92 @@ int btrfs_defrag_root(struct btrfs_root *root)
}
/*
+ * Do all the special, snapshot-related qgroup dirty work.
+ *
+ * Performs the qgroup inherit and the dirty tricks it needs, such as
+ * switching commit roots inside one transaction and writing all btree
+ * blocks to disk, so that qgroup accounting works.
+ */
+static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *src,
+ struct btrfs_root *parent,
+ struct btrfs_qgroup_inherit *inherit,
+ u64 dst_objectid)
+{
+ struct btrfs_fs_info *fs_info = src->fs_info;
+ int ret;
+
+ /*
+ * Save some performance in the case that qgroups are not
+ * enabled. If this check races with the ioctl, rescan will
+ * kick in anyway.
+ */
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_enabled) {
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return 0;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * We are going to commit the transaction; see the comment in
+ * btrfs_commit_transaction() for why tree_log_mutex is locked here.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ ret = commit_fs_roots(trans, src);
+ if (ret)
+ goto out;
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
+ if (ret < 0)
+ goto out;
+
+ /* Now qgroups are all updated, we can inherit them into the new qgroups */
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ src->root_key.objectid, dst_objectid,
+ inherit);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now we do a simplified transaction commit, which will:
+ * 1) commit all subvolume and extent trees,
+ * to ensure they all have a valid commit_root for accounting the
+ * later insert_dir_item()
+ * 2) write all btree blocks onto disk,
+ * to make sure later btree modifications get COWed; otherwise the
+ * commit_root could be polluted and cause wrong qgroup numbers
+ * In this simplified commit we don't really care about other trees
+ * like the chunk and root trees, as they won't affect qgroups.
+ * And we don't write the super block, to avoid a half-committed state.
+ */
+ ret = commit_cowonly_roots(trans, src);
+ if (ret)
+ goto out;
+ switch_commit_roots(trans->transaction, fs_info);
+ ret = btrfs_write_and_wait_transaction(trans, src);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret,
+ "Error while writing out transaction for qgroup");
+
+out:
+ mutex_unlock(&fs_info->tree_log_mutex);
+
+ /*
+ * Force the parent root to be updated: we recorded it earlier, so
+ * its last_trans == cur_transid and it would otherwise not be
+ * committed onto disk again after the later insert_dir_item().
+ */
+ if (!ret)
+ record_root_in_trans(trans, parent, 1);
+ return ret;
+}
+
+/*
* new snapshots need to be created at a very specific time in the
* transaction commit. This does the actual creation.
*
@@ -1383,7 +1470,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root);
+ record_root_in_trans(trans, parent_root, 0);
cur_time = current_fs_time(parent_inode->i_sb);
@@ -1420,7 +1507,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- record_root_in_trans(trans, root);
+ record_root_in_trans(trans, root, 0);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -1516,6 +1603,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
+ /*
+ * Do the special qgroup accounting for the snapshot: snapshot
+ * creation relies on a qgroup shortcut to stay fast, and to
+ * cooperate with that shortcut we account here; otherwise the
+ * snapshot would be greatly slowed down by a subtree qgroup rescan.
+ */
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ if (ret < 0)
+ goto fail;
+
ret = btrfs_insert_dir_item(trans, parent_root,
dentry->d_name.name, dentry->d_name.len,
parent_inode, &key,
@@ -1559,23 +1657,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- /*
- * account qgroup counters before qgroup_inherit()
- */
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret)
- goto fail;
- ret = btrfs_qgroup_account_extents(trans, fs_info);
- if (ret)
- goto fail;
- ret = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
fail:
pending->error = ret;
dir_item_existed:
@@ -1821,7 +1902,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
}
static inline void
@@ -2145,7 +2226,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto scrub_continue;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 72be51f7ca2f..c5abee4f01ad 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -110,7 +110,6 @@ struct btrfs_trans_handle {
u64 chunk_bytes_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
- unsigned long blocks_used;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
@@ -121,6 +120,7 @@ struct btrfs_trans_handle {
bool can_flush_pending_bgs;
bool reloc_reserved;
bool sync;
+ bool dirty;
unsigned int type;
/*
* this root is only needed to validate that the root passed to
@@ -144,7 +144,7 @@ struct btrfs_pending_snapshot {
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
- /* extra metadata reseration for relocation */
+ /* extra metadata reservation for relocation */
int error;
bool readonly;
struct list_head list;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e692eea87af6..c05f69a8ec42 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
/* for regular files, make sure corresponding
- * orhpan item exist. extents past the new EOF
+ * orphan item exist. extents past the new EOF
* will be truncated later by orphan cleanup.
*/
if (S_ISREG(mode)) {
@@ -2422,8 +2422,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
root_owner = btrfs_header_owner(parent);
next = btrfs_find_create_tree_block(root, bytenr);
- if (!next)
- return -ENOMEM;
+ if (IS_ERR(next))
+ return PTR_ERR(next);
if (*level == 1) {
ret = wc->process_func(root, next, wc, ptr_gen);
@@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
break;
clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_NEW);
}
/*
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
+ down_write(&BTRFS_I(inode)->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
+ btrfs_get_logged_extents(inode, logged_list, start, end);
/*
- * Collect any new ordered extents within the range. This is to
- * prevent logging file extent items without waiting for the disk
- * location they point to being written. We do this only to deal
- * with races against concurrent lockless direct IO writes.
+ * Some ordered extents started by fsync might have completed
+ * before we could collect them into the list logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
*/
- btrfs_get_logged_extents(inode, logged_list, start, end);
+ ret = btrfs_inode_check_errors(inode);
+ if (ret)
+ ctx->io_err = ret;
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
+ up_write(&BTRFS_I(inode)->dio_sem);
btrfs_release_path(path);
return ret;
@@ -4623,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
/*
- * Collect ordered extents only if we are logging data. This is to
- * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
- * will process the ordered extents if they still exists at the time,
- * because when we collect them we test and set for the flag
- * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
- * same ordered extents. The consequence for the LOG_INODE_ALL log mode
- * not processing the ordered extents is that we end up logging the
- * corresponding file extent items, based on the extent maps in the
- * inode's extent_map_tree's modified_list, without logging the
- * respective checksums (since the may still be only attached to the
- * ordered extents and have not been inserted in the csum tree by
- * btrfs_finish_ordered_io() yet).
- */
- if (inode_only == LOG_INODE_ALL)
- btrfs_get_logged_extents(inode, &logged_list, start, end);
-
- /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -4846,21 +4838,6 @@ log_extents:
goto out_unlock;
}
if (fast_search) {
- /*
- * Some ordered extents started by fsync might have completed
- * before we collected the ordered extents in logged_list, which
- * means they're gone, not in our logged_list nor in the inode's
- * ordered tree. We want the application/user space to know an
- * error happened while attempting to persist file data so that
- * it can take proper action. If such error happened, we leave
- * without writing to the log tree and the fsync must report the
- * file data write error and not commit the current transaction.
- */
- err = btrfs_inode_check_errors(inode);
- if (err) {
- ctx->io_err = err;
- goto out_unlock;
- }
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx, start, end);
if (ret) {
@@ -4937,7 +4914,7 @@ out_unlock:
* the actual unlink operation, so if we do this check before a concurrent task
* sets last_unlink_trans it means we've logged a consistent version/state of
* all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task migth have only updated last_unlink_trans before
+ * commit (the concurrent task might have only updated last_unlink_trans before
* we logged the inode or it might have also done the unlink).
*/
static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
@@ -4996,7 +4973,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
while (1) {
/*
* If we are logging a directory then we start with our inode,
- * not our parents inode, so we need to skipp setting the
+ * not our parent's inode, so we need to skip setting the
* logged_trans so that further down in the log code we don't
* think this inode has already been logged.
*/
@@ -5158,7 +5135,7 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR)
+ if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
btrfs_release_path(path);
ret = btrfs_log_inode(trans, root, di_inode,
@@ -5278,11 +5255,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (IS_ERR(dir_inode))
continue;
+ if (ctx)
+ ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, dir_inode,
LOG_INODE_ALL, 0, LLONG_MAX, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, dir_inode))
ret = 1;
+ if (!ret && ctx && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, root,
+ dir_inode, ctx);
iput(dir_inode);
if (ret)
goto out;
@@ -5375,7 +5357,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
log_dentries = true;
/*
- * On unlink we must make sure all our current and old parent directores
+ * On unlink we must make sure all our current and old parent directory
* inodes are fully logged. This is to prevent leaving dangling
* directory index entries in directories that were our parents but are
* not anymore. Not doing this results in old parent directory being
@@ -5519,7 +5501,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5533,7 +5515,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_std_error(fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5551,7 +5533,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_std_error(fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5566,7 +5548,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_std_error(fs_info, ret, "Couldn't read target root "
+ btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
@@ -5652,11 +5634,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode)) {
- mutex_lock(&BTRFS_I(inode)->log_mutex);
- BTRFS_I(inode)->last_unlink_trans = trans->transid;
- mutex_unlock(&BTRFS_I(inode)->log_mutex);
- }
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
/*
* if this directory was already logged any new
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 91feb2bdefee..b1434bb57e36 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -28,7 +28,7 @@
* }
* ulist_free(ulist);
*
- * This assumes the graph nodes are adressable by u64. This stems from the
+ * This assumes the graph nodes are addressable by u64. This stems from the
* usage for tree enumeration in btrfs, where the logical addresses are
* 64 bit.
*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bd0f45fb38c4..2f631b58ae00 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,13 +20,13 @@
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
-#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
+#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
};
+/*
+ * Table to convert BTRFS_RAID_* to the error code used when the minimum
+ * number of devices condition is not met. Zero means there's no
+ * corresponding BTRFS_ERROR_DEV_*_NOT_MET value.
+ */
+const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+ [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+ [BTRFS_RAID_DUP] = 0,
+ [BTRFS_RAID_RAID0] = 0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+ [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+};
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
@@ -699,7 +714,8 @@ static noinline int device_list_add(const char *path,
* if there is new btrfs on an already registered device,
* then remove the stale device entry.
*/
- btrfs_free_stale_device(device);
+ if (ret > 0)
+ btrfs_free_stale_device(device);
*fs_devices_ret = fs_devices;
@@ -988,6 +1004,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
+void btrfs_release_disk_super(struct page *page)
+{
+ kunmap(page);
+ put_page(page);
+}
+
+int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+ struct page **page, struct btrfs_super_block **disk_super)
+{
+ void *p;
+ pgoff_t index;
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ return 1;
+
+ /* make sure our super fits in the page */
+ if (sizeof(**disk_super) > PAGE_SIZE)
+ return 1;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_SHIFT;
+ if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
+ return 1;
+
+ /* pull in the page with our super */
+ *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_KERNEL);
+
+ if (IS_ERR_OR_NULL(*page))
+ return 1;
+
+ p = kmap(*page);
+
+ /* align our pointer to the offset of the super block */
+ *disk_super = p + (bytenr & ~PAGE_MASK);
+
+ if (btrfs_super_bytenr(*disk_super) != bytenr ||
+ btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(*page);
+ return 1;
+ }
+
+ if ((*disk_super)->label[0] &&
+ (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
+ (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+
+ return 0;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -999,13 +1065,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_super_block *disk_super;
struct block_device *bdev;
struct page *page;
- void *p;
int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
u64 bytenr;
- pgoff_t index;
/*
* we would like to check all the supers, but that would make
@@ -1018,41 +1082,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
-
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error;
}
- /* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
- goto error_bdev_put;
-
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_SIZE)
- goto error_bdev_put;
-
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
- goto error_bdev_put;
-
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
- index, GFP_NOFS);
-
- if (IS_ERR_OR_NULL(page))
+ if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
goto error_bdev_put;
- p = kmap(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + (bytenr & ~PAGE_MASK);
-
- if (btrfs_super_bytenr(disk_super) != bytenr ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC)
- goto error_unmap;
-
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
@@ -1060,8 +1097,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (ret > 0) {
if (disk_super->label[0]) {
- if (disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
} else {
printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
@@ -1073,9 +1108,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
-error_unmap:
- kunmap(page);
- put_page(page);
+ btrfs_release_disk_super(page);
error_bdev_put:
blkdev_put(bdev, flags);
@@ -1454,7 +1487,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_std_error(root->fs_info, ret, "Slot search failed");
+ btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
goto out;
}
@@ -1462,7 +1495,7 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
@@ -1688,32 +1721,92 @@ out:
return ret;
}
-int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+/*
+ * Verify that @num_devices satisfies the RAID profile constraints of the whole
+ * filesystem. It's up to the caller to adjust that number with regard to,
+ * e.g., device replace.
+ */
+static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
+ u64 num_devices)
+{
+ u64 all_avail;
+ unsigned seq;
+ int i;
+
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ all_avail = fs_info->avail_data_alloc_bits |
+ fs_info->avail_system_alloc_bits |
+ fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!(all_avail & btrfs_raid_group[i]))
+ continue;
+
+ if (num_devices < btrfs_raid_array[i].devs_min) {
+ int ret = btrfs_raid_mindev_error[i];
+
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
+ struct btrfs_device *device)
{
- struct btrfs_device *device;
struct btrfs_device *next_device;
- struct block_device *bdev;
- struct buffer_head *bh = NULL;
- struct btrfs_super_block *disk_super;
+
+ list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
+ if (next_device != device &&
+ !next_device->missing && next_device->bdev)
+ return next_device;
+ }
+
+ return NULL;
+}
+
+/*
+ * Helper function to check whether the given device is part of s_bdev /
+ * latest_bdev and replace it with the provided or the next active device.
+ * In the context where this function is called there should always be
+ * another device (or this_dev) which is active.
+ */
+void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *device, struct btrfs_device *this_dev)
+{
+ struct btrfs_device *next_device;
+
+ if (this_dev)
+ next_device = this_dev;
+ else
+ next_device = btrfs_find_next_active_device(fs_info->fs_devices,
+ device);
+ ASSERT(next_device);
+
+ if (fs_info->sb->s_bdev &&
+ (fs_info->sb->s_bdev == device->bdev))
+ fs_info->sb->s_bdev = next_device->bdev;
+
+ if (fs_info->fs_devices->latest_bdev == device->bdev)
+ fs_info->fs_devices->latest_bdev = next_device->bdev;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
+{
+ struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
- u64 all_avail;
- u64 devid;
u64 num_devices;
- u8 *dev_uuid;
- unsigned seq;
int ret = 0;
bool clear_super = false;
+ char *dev_name = NULL;
mutex_lock(&uuid_mutex);
- do {
- seq = read_seqbegin(&root->fs_info->profiles_lock);
-
- all_avail = root->fs_info->avail_data_alloc_bits |
- root->fs_info->avail_system_alloc_bits |
- root->fs_info->avail_metadata_alloc_bits;
- } while (read_seqretry(&root->fs_info->profiles_lock, seq));
-
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
@@ -1722,78 +1815,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
- ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
- goto out;
- }
-
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
- ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+ ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
+ if (ret)
goto out;
- }
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
- root->fs_info->fs_devices->rw_devices <= 2) {
- ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
- goto out;
- }
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
- root->fs_info->fs_devices->rw_devices <= 3) {
- ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+ ret = btrfs_find_device_by_devspec(root, devid, device_path,
+ &device);
+ if (ret)
goto out;
- }
-
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- device = NULL;
- devices = &root->fs_info->fs_devices->devices;
- /*
- * It is safe to read the devices since the volume_mutex
- * is held.
- */
- list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata &&
- !tmp->is_tgtdev_for_dev_replace &&
- !tmp->bdev) {
- device = tmp;
- break;
- }
- }
- bdev = NULL;
- bh = NULL;
- disk_super = NULL;
- if (!device) {
- ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
- goto out;
- }
- } else {
- ret = btrfs_get_bdev_and_sb(device_path,
- FMODE_WRITE | FMODE_EXCL,
- root->fs_info->bdev_holder, 0,
- &bdev, &bh);
- if (ret)
- goto out;
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(root->fs_info, devid, dev_uuid,
- disk_super->fsid);
- if (!device) {
- ret = -ENOENT;
- goto error_brelse;
- }
- }
if (device->is_tgtdev_for_dev_replace) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto error_brelse;
+ goto out;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto error_brelse;
+ goto out;
}
if (device->writeable) {
@@ -1801,6 +1839,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
unlock_chunks(root);
+ dev_name = kstrdup(device->name->str, GFP_KERNEL);
+ if (!dev_name) {
+ ret = -ENOMEM;
+ goto error_undo;
+ }
clear_super = true;
}
@@ -1842,12 +1885,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->missing)
device->fs_devices->missing_devices--;
- next_device = list_entry(root->fs_info->fs_devices->devices.next,
- struct btrfs_device, dev_list);
- if (device->bdev == root->fs_info->sb->s_bdev)
- root->fs_info->sb->s_bdev = next_device->bdev;
- if (device->bdev == root->fs_info->fs_devices->latest_bdev)
- root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+ btrfs_assign_next_active_device(root->fs_info, device, NULL);
if (device->bdev) {
device->fs_devices->open_devices--;
@@ -1883,63 +1921,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
- if (clear_super && disk_super) {
- u64 bytenr;
- int i;
-
- /* make sure this device isn't detected as part of
- * the FS anymore
- */
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
-
- /* clear the mirror copies of super block on the disk
- * being removed, 0th copy is been taken care above and
- * the below would take of the rest
- */
- for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
-
- brelse(bh);
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
- continue;
-
- disk_super = (struct btrfs_super_block *)bh->b_data;
-
- if (btrfs_super_bytenr(disk_super) != bytenr ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- continue;
- }
- memset(&disk_super->magic, 0,
- sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
+ if (clear_super) {
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder);
+ if (!IS_ERR(bdev)) {
+ btrfs_scratch_superblocks(bdev, dev_name);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
}
}
- ret = 0;
-
- if (bdev) {
- /* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
- /* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
- }
-
-error_brelse:
- brelse(bh);
- if (bdev)
- blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
+ kfree(dev_name);
+
mutex_unlock(&uuid_mutex);
return ret;
+
error_undo:
if (device->writeable) {
lock_chunks(root);
@@ -1948,7 +1946,7 @@ error_undo:
device->fs_devices->rw_devices++;
unlock_chunks(root);
}
- goto error_brelse;
+ goto out;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
@@ -1972,11 +1970,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->missing)
fs_devices->missing_devices--;
- if (srcdev->writeable) {
+ if (srcdev->writeable)
fs_devices->rw_devices--;
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
- }
if (srcdev->bdev)
fs_devices->open_devices--;
@@ -1987,6 +1982,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
+ if (srcdev->writeable) {
+ /* zero out the old super if it is writable */
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
+ }
call_rcu(&srcdev->rcu, free_device);
/*
@@ -2016,32 +2015,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
- struct btrfs_device *next_device;
-
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
- if (tgtdev->bdev) {
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ if (tgtdev->bdev)
fs_info->fs_devices->open_devices--;
- }
+
fs_info->fs_devices->num_devices--;
- next_device = list_entry(fs_info->fs_devices->devices.next,
- struct btrfs_device, dev_list);
- if (tgtdev->bdev == fs_info->sb->s_bdev)
- fs_info->sb->s_bdev = next_device->bdev;
- if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
- list_del_rcu(&tgtdev->dev_list);
+ btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
- call_rcu(&tgtdev->rcu, free_device);
+ list_del_rcu(&tgtdev->dev_list);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+
+ /*
+ * The update_dev_time() within btrfs_scratch_superblocks()
+ * may lead to a call to btrfs_show_devname() which will try
+ * to hold device_list_mutex. And here this device
+ * is already out of the device list, so we don't have to hold
+ * the device_list_mutex lock.
+ */
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ call_rcu(&tgtdev->rcu, free_device);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -2102,6 +2102,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
/*
+ * Lookup a device given by device id, or the path if the id is 0.
+ */
+int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+ char *devpath,
+ struct btrfs_device **device)
+{
+ int ret;
+
+ if (devid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, devid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ if (!devpath || !devpath[0])
+ return -EINVAL;
+
+ ret = btrfs_find_device_missing_or_by_path(root, devpath,
+ device);
+ }
+ return ret;
+}
+
+/*
* does all the dirty work required for changing file system's UUID.
*/
static int btrfs_prepare_sprout(struct btrfs_root *root)
@@ -2165,7 +2190,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
}
/*
- * strore the expected generation for seed devices in device items.
+ * Store the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -2418,7 +2443,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
@@ -2663,7 +2688,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_std_error(root->fs_info, -ENOENT,
+ btrfs_handle_fs_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
@@ -2671,7 +2696,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
@@ -2736,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
int i, ret = 0;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
/* Just in case */
root = root->fs_info->chunk_root;
@@ -2762,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
check_system_chunk(trans, extent_root, map->type);
unlock_chunks(root->fs_info->chunk_root);
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+ */
+ mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
if (ret) {
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, root, ret);
goto out;
}
@@ -2786,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
if (ret) {
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, root, ret);
goto out;
}
}
}
+ mutex_unlock(&fs_devices->device_list_mutex);
+
ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2857,7 +2893,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
@@ -3362,7 +3398,7 @@ static int should_balance_chunk(struct btrfs_root *root,
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
/*
* Same logic as the 'limit' filter; the minimum cannot be
- * determined here because we do not have the global informatoin
+ * determined here because we do not have the global information
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
@@ -3402,6 +3438,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
+ u64 bytes_used = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3540,7 +3577,13 @@ again:
goto loop;
}
- if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ ASSERT(fs_info->data_sinfo);
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ !chunk_reserved && !bytes_used) {
trans = btrfs_start_transaction(chunk_root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3632,7 +3675,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
@@ -3693,10 +3736,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
num_devices--;
}
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
- allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- if (num_devices == 1)
- allowed |= BTRFS_BLOCK_GROUP_DUP;
- else if (num_devices > 1)
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
+ if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
if (num_devices > 2)
allowed |= BTRFS_BLOCK_GROUP_RAID5;
@@ -4200,6 +4241,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -5278,7 +5320,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
- BUG_ON(offset < stripe_offset);
+ if (offset < stripe_offset) {
+ btrfs_crit(fs_info, "stripe math has gone wrong, "
+ "stripe_offset=%llu, offset=%llu, start=%llu, "
+ "logical=%llu, stripe_len=%llu",
+ stripe_offset, offset, em->start, logical,
+ stripe_len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
@@ -5519,7 +5569,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
&stripe_index);
mirror_num = stripe_index + 1;
}
- BUG_ON(stripe_index >= map->num_stripes);
+ if (stripe_index >= map->num_stripes) {
+ btrfs_crit(fs_info, "stripe index math went horribly wrong, "
+ "got stripe_index=%u, num_stripes=%u",
+ stripe_index, map->num_stripes);
+ ret = -EINVAL;
+ goto out;
+ }
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing) {
@@ -5718,20 +5774,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
if (found) {
- if (physical_of_found + map->stripe_len <=
- dev_replace->cursor_left) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
- tgtdev_indexes++;
- num_stripes++;
- }
+ tgtdev_indexes++;
+ num_stripes++;
}
}
@@ -6032,7 +6085,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
- /* Shoud be the original bio. */
+ /* Should be the original bio. */
WARN_ON(bio != bbio->orig_bio);
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -6206,27 +6259,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
return dev;
}
-static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
- struct extent_buffer *leaf,
- struct btrfs_chunk *chunk)
+/* Return -EIO on any error, otherwise return 0. */
+static int btrfs_check_chunk_valid(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical)
{
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
- u64 logical;
u64 length;
u64 stripe_len;
- u64 devid;
- u8 uuid[BTRFS_UUID_SIZE];
- int num_stripes;
- int ret;
- int i;
+ u16 num_stripes;
+ u16 sub_stripes;
+ u64 type;
- logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- /* Validation check */
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+
if (!num_stripes) {
btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
num_stripes);
@@ -6237,24 +6286,70 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
"invalid chunk logical %llu", logical);
return -EIO;
}
+ if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
+ btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+ btrfs_chunk_sector_size(leaf, chunk));
+ return -EIO;
+ }
if (!length || !IS_ALIGNED(length, root->sectorsize)) {
btrfs_err(root->fs_info,
"invalid chunk length %llu", length);
return -EIO;
}
- if (!is_power_of_2(stripe_len)) {
+ if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
stripe_len);
return -EIO;
}
if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- btrfs_chunk_type(leaf, chunk)) {
+ type) {
btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
BTRFS_BLOCK_GROUP_PROFILE_MASK) &
btrfs_chunk_type(leaf, chunk));
return -EIO;
}
+ if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != 1)) {
+ btrfs_err(root->fs_info,
+ "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+ num_stripes, sub_stripes,
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ u64 logical;
+ u64 length;
+ u64 stripe_len;
+ u64 devid;
+ u8 uuid[BTRFS_UUID_SIZE];
+ int num_stripes;
+ int ret;
+ int i;
+
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ ret = btrfs_check_chunk_valid(root, leaf, chunk, logical);
+ if (ret)
+ return ret;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6502,6 +6597,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
u32 array_size;
u32 len = 0;
u32 cur_offset;
+ u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
@@ -6511,12 +6607,12 @@ int btrfs_read_sys_array(struct btrfs_root *root)
* overallocate but we can keep it as-is, only the first page is used.
*/
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
- if (!sb)
- return -ENOMEM;
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
- * The sb extent buffer is artifical and just used to read the system array.
+ * The sb extent buffer is artificial and just used to read the system array.
 * set_extent_buffer_uptodate() call does not properly mark all its
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
@@ -6568,6 +6664,15 @@ int btrfs_read_sys_array(struct btrfs_root *root)
break;
}
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
+ btrfs_err(root->fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6586,13 +6691,15 @@ int btrfs_read_sys_array(struct btrfs_root *root)
sb_array_offset += len;
cur_offset += len;
}
- free_extent_buffer(sb);
+ clear_extent_buffer_uptodate(sb);
+ free_extent_buffer_stale(sb);
return ret;
out_short_read:
printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
len, cur_offset);
- free_extent_buffer(sb);
+ clear_extent_buffer_uptodate(sb);
+ free_extent_buffer_stale(sb);
return -EIO;
}
@@ -6604,6 +6711,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
struct btrfs_key found_key;
int ret;
int slot;
+ u64 total_dev = 0;
root = root->fs_info->chunk_root;
@@ -6645,6 +6753,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
ret = read_one_dev(root, leaf, dev_item);
if (ret)
goto error;
+ total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -6654,6 +6763,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
}
path->slots[0]++;
}
+
+ /*
+ * After loading the chunk tree we've got all the device information,
+ * so do another round of validation checks.
+ */
+ if (total_dev != root->fs_info->fs_devices->total_devices) {
+ btrfs_err(root->fs_info,
+ "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_super_num_devices(root->fs_info->super_copy),
+ total_dev);
+ ret = -EINVAL;
+ goto error;
+ }
+ if (btrfs_super_total_bytes(root->fs_info->super_copy) <
+ root->fs_info->fs_devices->total_rw_bytes) {
+ btrfs_err(root->fs_info,
+ "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
+ btrfs_super_total_bytes(root->fs_info->super_copy),
+ root->fs_info->fs_devices->total_rw_bytes);
+ ret = -EINVAL;
+ goto error;
+ }
ret = 0;
error:
unlock_chunks(root);
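
With btrfs_rm_device() now taking an explicit devid in addition to the path, a caller can remove a device by either spec. The fragment below is only a sketch of such a caller (the function name is hypothetical); the real ioctl plumbing, permission checks and exclusive-operation handling are omitted.

static int example_remove_device(struct btrfs_root *root,
				 char *device_path, u64 devid)
{
	/*
	 * A non-zero devid takes precedence and the path may be NULL;
	 * otherwise the device is looked up by path, where the special
	 * string "missing" selects a device that has no bdev attached.
	 */
	if (devid)
		return btrfs_rm_device(root, NULL, devid);

	return btrfs_rm_device(root, device_path, 0);
}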
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1939ebde63df..0ac90f8d85bd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -340,14 +340,14 @@ struct btrfs_raid_attr {
};
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-
+extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
struct map_lookup {
u64 type;
int io_align;
int io_width;
- int stripe_len;
+ u64 stripe_len;
int sector_size;
int num_stripes;
int sub_stripes;
@@ -357,52 +357,6 @@ struct map_lookup {
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
-/*
- * Restriper's general type filter
- */
-#define BTRFS_BALANCE_DATA (1ULL << 0)
-#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
-#define BTRFS_BALANCE_METADATA (1ULL << 2)
-
-#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
- BTRFS_BALANCE_SYSTEM | \
- BTRFS_BALANCE_METADATA)
-
-#define BTRFS_BALANCE_FORCE (1ULL << 3)
-#define BTRFS_BALANCE_RESUME (1ULL << 4)
-
-/*
- * Balance filters
- */
-#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
-#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
-#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
-#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
-#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
-#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
-#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
-#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
-#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
-
-#define BTRFS_BALANCE_ARGS_MASK \
- (BTRFS_BALANCE_ARGS_PROFILES | \
- BTRFS_BALANCE_ARGS_USAGE | \
- BTRFS_BALANCE_ARGS_DEVID | \
- BTRFS_BALANCE_ARGS_DRANGE | \
- BTRFS_BALANCE_ARGS_VRANGE | \
- BTRFS_BALANCE_ARGS_LIMIT | \
- BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
- BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
- BTRFS_BALANCE_ARGS_USAGE_RANGE)
-
-/*
- * Profile changing flags. When SOFT is set we won't relocate chunk if
- * it already has the target profile (even though it may be
- * half-filled).
- */
-#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
-#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
-
struct btrfs_balance_args;
struct btrfs_balance_progress;
struct btrfs_balance_control {
@@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *device, struct btrfs_device *this_dev);
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device);
+int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+ char *devpath,
+ struct btrfs_device **device);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
-int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid);
void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3bfb252206c7..d1a177a3dbe8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
}
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size,
- int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
name = xattr_full_name(handler, name);
return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
- struct dentry *dentry,
+ struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
{
name = xattr_full_name(handler, name);
- return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
+ return btrfs_set_prop(inode, name, value, size, flags);
}
static const struct xattr_handler btrfs_security_xattr_handler = {
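
For context, the ->set changes above follow the VFS switch to passing the inode alongside the dentry. A minimal, hypothetical handler using the new prototype might look like the sketch below ("examplefs" names are made up; only the callback signature is taken from the patch).

static int examplefs_xattr_set(const struct xattr_handler *handler,
			       struct dentry *unused, struct inode *inode,
			       const char *name, const void *value,
			       size_t size, int flags)
{
	/* @name arrives without the handler prefix; rebuild it if needed */
	name = xattr_full_name(handler, name);
	return examplefs_setxattr(inode, name, value, size, flags);
}

static const struct xattr_handler examplefs_user_xattr_handler = {
	.prefix	= "user.",
	.set	= examplefs_xattr_set,
};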
diff --git a/fs/buffer.c b/fs/buffer.c
index af0d9a82a8ed..754813a6962b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -255,17 +255,17 @@ out:
*/
static void free_more_memory(void)
{
- struct zone *zone;
+ struct zoneref *z;
int nid;
wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
yield();
for_each_online_node(nid) {
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL,
- &zone);
- if (zone)
+
+ z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+ gfp_zone(GFP_NOFS), NULL);
+ if (z->zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS, NULL);
}
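
The free_more_memory() hunk reflects first_zones_zonelist() now returning a struct zoneref pointer instead of filling in a zone pointer through an out-argument. A small sketch of the new calling convention follows; the helper name is made up for illustration.

static bool example_node_has_usable_zone(int nid, gfp_t gfp_mask)
{
	struct zoneref *z;

	/* The returned zoneref is always valid; its zone may be NULL. */
	z = first_zones_zonelist(node_zonelist(nid, gfp_mask),
				 gfp_zone(gfp_mask), NULL);
	return z->zone != NULL;
}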
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 861d611b8c05..ce5f345d70f5 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
* check if the backing cache is updated to FS-Cache
* - called by FS-Cache when evaluates if need to invalidate the cache
*/
-static bool cachefiles_check_consistency(struct fscache_operation *op)
+static int cachefiles_check_consistency(struct fscache_operation *op)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 43098cd9602b..26a9d10d75e9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
/*
* Finish an async read(ahead) op.
*/
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_osd_data *osd_data;
- int rc = req->r_result;
- int bytes = le32_to_cpu(msg->hdr.data_len);
+ int rc = req->r_result <= 0 ? req->r_result : 0;
+ int bytes = req->r_result >= 0 ? req->r_result : 0;
int num_pages;
int i;
@@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0 && rc != -ENOENT)
+ if (rc < 0 && rc != -ENOENT) {
+ ceph_fscache_readpage_cancel(inode, page);
goto unlock;
+ }
if (bytes < (int)PAGE_SIZE) {
/* zero (remainder of) page */
int s = bytes < 0 ? 0 : bytes;
@@ -376,8 +378,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
req->r_callback = finish_read;
req->r_inode = inode;
- ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0)
@@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
- ceph_readpage_to_fscache(inode, page);
-
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
truncate_seq, truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
- dout("writepage setting page/mapping error %d %p\n", err, page);
+ struct writeback_control tmp_wbc;
+ if (!wbc)
+ wbc = &tmp_wbc;
+ if (err == -ERESTARTSYS) {
+ /* killed by SIGKILL */
+ dout("writepage interrupted page %p\n", page);
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ goto out;
+ }
+ dout("writepage setting page/mapping error %d %p\n",
+ err, page);
SetPageError(page);
mapping_set_error(&inode->i_data, err);
- if (wbc)
- wbc->pages_skipped++;
+ wbc->pages_skipped++;
} else {
dout("writepage cleaned page %p\n", page);
err = 0; /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
BUG_ON(!inode);
ihold(inode);
err = writepage_nounlock(page, wbc);
+ if (err == -ERESTARTSYS) {
+ /* direct memory reclaimer was killed by SIGKILL. return 0
+ * to prevent caller from setting mapping/page error */
+ err = 0;
+ }
unlock_page(page);
iput(inode);
return err;
}
-
/*
* lame release_pages helper. release_pages() isn't exported to
* modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
* If we get an error, set the mapping error bit, but not the individual
* page error bits.
*/
-static void writepages_finish(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
bool remove_page;
-
dout("writepages_finish %p rc %d\n", inode, rc);
if (rc < 0)
mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
+ if (rc < 0)
+ SetPageError(page);
+
ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- pr_warn("writepage_start %p on forced umount\n", inode);
- truncate_pagecache(inode, 0);
+ if (ci->i_wrbuffer_ref > 0) {
+ pr_warn_ratelimited(
+ "writepage_start %p %lld forced umount\n",
+ inode, ceph_ino(inode));
+ }
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
@@ -1063,10 +1079,7 @@ new_request:
pages = NULL;
}
- vino = ceph_vino(inode);
- ceph_osdc_build_request(req, offset, snapc, vino.snap,
- &inode->i_mtime);
-
+ req->r_mtime = inode->i_mtime;
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
BUG_ON(rc);
req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
mapping->writeback_index = index;
out:
- if (req)
- ceph_osdc_put_request(req);
+ ceph_osdc_put_request(req);
ceph_put_snap_context(snapc);
dout("writepages done, rc = %d\n", rc);
return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
struct page *page)
{
struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
loff_t page_off = pos & PAGE_MASK;
int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
int r;
struct ceph_snap_context *snapc, *oldest;
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout(" page %p forced umount\n", page);
+ unlock_page(page);
+ return -EIO;
+ }
+
retry_locked:
/* writepages currently holds page lock, but if we change that later, */
wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
snapc = ceph_get_snap_context(snapc);
unlock_page(page);
ceph_queue_writeback(inode);
- r = wait_event_interruptible(ci->i_cap_wq,
+ r = wait_event_killable(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc);
if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
.direct_IO = ceph_direct_io,
};
+static void ceph_block_sigs(sigset_t *oldset)
+{
+ sigset_t mask;
+ siginitsetinv(&mask, sigmask(SIGKILL));
+ sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+ sigprocmask(SIG_SETMASK, oldset, NULL);
+}
/*
* vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *pinned_page = NULL;
loff_t off = vmf->pgoff << PAGE_SHIFT;
int want, got, ret;
+ sigset_t oldset;
+
+ ceph_block_sigs(&oldset);
dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
- while (1) {
- got = 0;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
- -1, &got, &pinned_page);
- if (ret == 0)
- break;
- if (ret != -ERESTARTSYS) {
- WARN_ON(1);
- return VM_FAULT_SIGBUS;
- }
- }
+
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+ if (ret < 0)
+ goto out_restore;
+
dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ceph_put_cap_refs(ci, got);
if (ret != -EAGAIN)
- return ret;
+ goto out_restore;
/* read inline data */
if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
- goto out;
+ goto out_inline;
}
ret1 = __ceph_do_getattr(inode, page,
CEPH_STAT_CAP_INLINE_DATA, true);
if (ret1 < 0 || off >= i_size_read(inode)) {
unlock_page(page);
put_page(page);
- ret = VM_FAULT_SIGBUS;
- goto out;
+ if (ret1 < 0)
+ ret = ret1;
+ else
+ ret = VM_FAULT_SIGBUS;
+ goto out_inline;
}
if (ret1 < PAGE_SIZE)
zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
SetPageUptodate(page);
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+ dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+ inode, off, (size_t)PAGE_SIZE, ret);
}
-out:
- dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
- inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+ ceph_restore_sigs(&oldset);
+ if (ret < 0)
+ ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
return ret;
}
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size = i_size_read(inode);
size_t len;
int want, got, ret;
+ sigset_t oldset;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
- return VM_FAULT_SIGBUS;
+ return VM_FAULT_OOM;
+
+ ceph_block_sigs(&oldset);
if (ci->i_inline_version != CEPH_INLINE_NONE) {
struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ceph_uninline_data(vma->vm_file, locked_page);
if (locked_page)
unlock_page(locked_page);
- if (ret < 0) {
- ret = VM_FAULT_SIGBUS;
+ if (ret < 0)
goto out_free;
- }
}
if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_BUFFER;
- while (1) {
- got = 0;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
- &got, NULL);
- if (ret == 0)
- break;
- if (ret != -ERESTARTSYS) {
- WARN_ON(1);
- ret = VM_FAULT_SIGBUS;
- goto out_free;
- }
- }
+
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+ &got, NULL);
+ if (ret < 0)
+ goto out_free;
+
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
inode, off, len, ceph_cap_string(got));
/* Update time before taking page lock */
file_update_time(vma->vm_file);
- lock_page(page);
+ do {
+ lock_page(page);
- ret = VM_FAULT_NOPAGE;
- if ((off > size) ||
- (page->mapping != inode->i_mapping)) {
- unlock_page(page);
- goto out;
- }
+ if ((off > size) || (page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ break;
+ }
+
+ ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+ if (ret >= 0) {
+ /* success. we'll keep the page locked. */
+ set_page_dirty(page);
+ ret = VM_FAULT_LOCKED;
+ }
+ } while (ret == -EAGAIN);
- ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (ret >= 0) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
- ret = VM_FAULT_LOCKED;
- } else {
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
- }
-out:
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
int dirty;
@@ -1495,8 +1523,10 @@ out:
inode, off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs(ci, got);
out_free:
+ ceph_restore_sigs(&oldset);
ceph_free_cap_flush(prealloc_cf);
-
+ if (ret < 0)
+ ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
return ret;
}
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
goto out;
}
- ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+ req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
goto out_put;
}
- ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+ req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool;
- snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
- "%llx.00000000", ci->i_vino.ino);
- rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+ ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+ err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
- wr_req->r_base_oloc.pool = pool;
- wr_req->r_base_oid = rd_req->r_base_oid;
+ ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+ ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+ err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
/* one page should be large enough for STAT data */
pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true);
- ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
- &ci->vfs_inode.i_mtime);
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
- ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
- &ci->vfs_inode.i_mtime);
+ wr_req->r_mtime = ci->vfs_inode.i_mtime;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
out_unlock:
up_write(&mdsc->pool_perm_rwsem);
- if (rd_req)
- ceph_osdc_put_request(rd_req);
- if (wr_req)
- ceph_osdc_put_request(wr_req);
+ ceph_osdc_put_request(rd_req);
+ ceph_osdc_put_request(wr_req);
out:
if (!err)
err = have;
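
The fs/ceph/addr.c changes above add ceph_block_sigs()/ceph_restore_sigs() and wrap the fault and mkwrite paths in them, so that only SIGKILL can interrupt the wait for caps. Roughly the same pattern can be sketched in userspace with POSIX sigprocmask(); this is only an analogue of the kernel helpers, not the in-tree code:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void block_all_but_kill(sigset_t *oldset)
{
	sigset_t mask;

	sigfillset(&mask);
	sigdelset(&mask, SIGKILL);	/* SIGKILL can never be blocked anyway */
	sigprocmask(SIG_BLOCK, &mask, oldset);
}

static void restore_sigs(const sigset_t *oldset)
{
	sigprocmask(SIG_SETMASK, oldset, NULL);
}

int main(void)
{
	sigset_t oldset;

	block_all_but_kill(&oldset);
	/* ... work that must not be interrupted by ordinary signals ... */
	printf("signals blocked for pid %d, only SIGKILL gets through\n", getpid());
	restore_sigs(&oldset);
	return 0;
}

The apparent intent in the fault paths above is the same: together with the switch to wait_event_killable(), a task stuck waiting for caps can still be killed, while ordinary signals no longer abort the fault with -ERESTARTSYS part way through.
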
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc..238c55b01723 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -25,6 +25,7 @@
#include "cache.h"
struct ceph_aux_inode {
+ u64 version;
struct timespec mtime;
loff_t size;
};
@@ -69,15 +70,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
fsc, true);
-
- if (fsc->fscache == NULL) {
+ if (!fsc->fscache)
pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
- return 0;
- }
-
- fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
- if (fsc->revalidate_wq == NULL)
- return -ENOMEM;
return 0;
}
@@ -105,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
const struct inode* inode = &ci->vfs_inode;
memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
aux.mtime = inode->i_mtime;
aux.size = i_size_read(inode);
@@ -131,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
aux.mtime = inode->i_mtime;
aux.size = i_size_read(inode);
@@ -181,32 +177,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.now_uncached = ceph_fscache_inode_now_uncached,
};
-void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
- struct ceph_inode_info* ci)
+void ceph_fscache_register_inode_cookie(struct inode *inode)
{
- struct inode* inode = &ci->vfs_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
/* No caching for filesystem */
if (fsc->fscache == NULL)
return;
/* Only cache for regular files that are read only */
- if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+ if (!S_ISREG(inode->i_mode))
return;
- /* Avoid multiple racing open requests */
- inode_lock(inode);
-
- if (ci->fscache)
- goto done;
-
- ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- ci, true);
- fscache_check_consistency(ci->fscache);
-done:
+ inode_lock_nested(inode, I_MUTEX_CHILD);
+ if (!ci->fscache) {
+ ci->fscache = fscache_acquire_cookie(fsc->fscache,
+ &ceph_fscache_inode_object_def,
+ ci, false);
+ }
inode_unlock(inode);
-
}
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
@@ -222,6 +212,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
fscache_relinquish_cookie(cookie, 0);
}
+static bool ceph_fscache_can_enable(void *data)
+{
+ struct inode *inode = data;
+ return !inode_is_open_for_write(inode);
+}
+
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!fscache_cookie_valid(ci->fscache))
+ return;
+
+ if (inode_is_open_for_write(inode)) {
+ dout("fscache_file_set_cookie %p %p disabling cache\n",
+ inode, filp);
+ fscache_disable_cookie(ci->fscache, false);
+ fscache_uncache_all_inode_pages(ci->fscache, inode);
+ } else {
+ fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
+ inode);
+ if (fscache_cookie_enabled(ci->fscache)) {
+ dout("fscache_file_set_cookie %p %p enabing cache\n",
+ inode, filp);
+ }
+ }
+}
+
static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
{
if (!error)
@@ -236,10 +254,9 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
unlock_page(page);
}
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
{
- return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
- (ci->i_fscache_gen == ci->i_rdcache_gen));
+ return ci->i_fscache_gen == ci->i_rdcache_gen;
}
@@ -332,69 +349,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
- if (fsc->revalidate_wq)
- destroy_workqueue(fsc->revalidate_wq);
-
fscache_relinquish_cookie(fsc->fscache, 0);
fsc->fscache = NULL;
}
-static void ceph_revalidate_work(struct work_struct *work)
-{
- int issued;
- u32 orig_gen;
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_revalidate_work);
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&ci->i_ceph_lock);
- issued = __ceph_caps_issued(ci, NULL);
- orig_gen = ci->i_rdcache_gen;
- spin_unlock(&ci->i_ceph_lock);
-
- if (!(issued & CEPH_CAP_FILE_CACHE)) {
- dout("revalidate_work lost cache before validation %p\n",
- inode);
- goto out;
- }
-
- if (!fscache_check_consistency(ci->fscache))
- fscache_invalidate(ci->fscache);
-
- spin_lock(&ci->i_ceph_lock);
- /* Update the new valid generation (backwards sanity check too) */
- if (orig_gen > ci->i_fscache_gen) {
- ci->i_fscache_gen = orig_gen;
- }
- spin_unlock(&ci->i_ceph_lock);
-
-out:
- iput(&ci->vfs_inode);
-}
-
-void ceph_queue_revalidate(struct inode *inode)
+/*
+ * caller should hold CEPH_CAP_FILE_{RD,CACHE}
+ */
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+ if (cache_valid(ci))
return;
- ihold(inode);
-
- if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
- &ci->i_revalidate_work)) {
- dout("ceph_queue_revalidate %p\n", inode);
- } else {
- dout("ceph_queue_revalidate %p failed\n)", inode);
- iput(inode);
+ /* reuse i_truncate_mutex. There should be no pending
+ * truncate while the caller holds CEPH_CAP_FILE_RD */
+ mutex_lock(&ci->i_truncate_mutex);
+ if (!cache_valid(ci)) {
+ if (fscache_check_consistency(ci->fscache))
+ fscache_invalidate(ci->fscache);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_fscache_gen = ci->i_rdcache_gen;
+ spin_unlock(&ci->i_ceph_lock);
}
-}
-
-void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
- ci->fscache = NULL;
- /* The first load is verifed cookie open time */
- ci->i_fscache_gen = 1;
- INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+ mutex_unlock(&ci->i_truncate_mutex);
}
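
The fs/ceph/cache.c changes above add an inode version to struct ceph_aux_inode, so the auxiliary data stored with the fscache cookie changes whenever the inode may have changed and the cached object is then treated as obsolete. A small userspace model of that comparison; check_aux() and the OKAY/OBSOLETE values are illustrative stand-ins for the fscache check_aux callback, not the real API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/* Illustrative stand-in for struct ceph_aux_inode. */
struct aux_inode {
	uint64_t version;
	struct timespec mtime;
	int64_t size;
};

enum checkaux { CHECKAUX_OKAY, CHECKAUX_OBSOLETE };

/* Compare the aux blob stored with the cache object against the current
 * inode state; any mismatch means the cached data must be discarded. */
static enum checkaux check_aux(const struct aux_inode *stored,
			       const struct aux_inode *now)
{
	if (memcmp(stored, now, sizeof(*stored)) != 0)
		return CHECKAUX_OBSOLETE;
	return CHECKAUX_OKAY;
}

int main(void)
{
	struct aux_inode stored, now;

	memset(&stored, 0, sizeof(stored));
	stored.version = 5;
	stored.size = 4096;
	now = stored;

	printf("unchanged:      %s\n",
	       check_aux(&stored, &now) == CHECKAUX_OKAY ? "okay" : "obsolete");

	now.version = 6;	/* inode changed since the object was cached */
	printf("bumped version: %s\n",
	       check_aux(&stored, &now) == CHECKAUX_OKAY ? "okay" : "obsolete");
	return 0;
}

Adding the version field makes the mtime/size pair no longer the only thing the consistency check depends on, which matches the two hunks above that set aux.version from ci->i_version.
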
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 5ac591bd012b..7e72c7594f0c 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -34,10 +34,10 @@ void ceph_fscache_unregister(void);
int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
-void ceph_fscache_inode_init(struct ceph_inode_info *ci);
-void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
- struct ceph_inode_info* ci);
+void ceph_fscache_register_inode_cookie(struct inode *inode);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
int ceph_readpages_from_fscache(struct inode *inode,
@@ -46,12 +46,11 @@ int ceph_readpages_from_fscache(struct inode *inode,
unsigned *nr_pages);
void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
-void ceph_queue_revalidate(struct inode *inode);
-static inline void ceph_fscache_update_objectsize(struct inode *inode)
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- fscache_attr_changed(ci->fscache);
+ ci->fscache = NULL;
+ ci->i_fscache_gen = 0;
}
static inline void ceph_fscache_invalidate(struct inode *inode)
@@ -88,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode,
return fscache_readpages_cancel(ci->fscache, pages);
}
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+{
+ ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+}
+
#else
static inline int ceph_fscache_register(void)
@@ -112,8 +116,20 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
}
-static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
- struct ceph_inode_info* ci)
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_file_set_cookie(struct inode *inode,
+ struct file *filp)
+{
+}
+
+static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
{
}
@@ -141,10 +157,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}
-static inline void ceph_fscache_update_objectsize(struct inode *inode)
-{
-}
-
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
@@ -154,10 +166,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode,
{
}
-static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
-{
-}
-
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
{
return 1;
@@ -173,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode,
{
}
-static inline void ceph_queue_revalidate(struct inode *inode)
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
{
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cfaeef18cbca..6f60d0a3d0f9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1656,7 +1656,7 @@ retry_locked:
*/
if ((!is_delayed || mdsc->stopping) &&
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
@@ -1698,8 +1698,8 @@ retry_locked:
revoking = cap->implemented & ~cap->issued;
dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap->issued),
- ceph_cap_string(cap_used),
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
@@ -2317,7 +2317,7 @@ again:
/* make sure file is actually open */
file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) == 0) {
+ if ((file_wanted & need) != need) {
dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
ceph_cap_string(need), ceph_cap_string(file_wanted));
*err = -EBADF;
@@ -2393,6 +2393,9 @@ again:
snap_rwsem_locked = true;
}
*got = need | (have & want);
+ if ((need & CEPH_CAP_FILE_RD) &&
+ !(*got & CEPH_CAP_FILE_CACHE))
+ ceph_disable_fscache_readpage(ci);
__take_cap_refs(ci, *got, true);
ret = 1;
}
@@ -2412,12 +2415,26 @@ again:
goto out_unlock;
}
- if (!__ceph_is_any_caps(ci) &&
- ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- *err = -EIO;
- ret = 1;
- goto out_unlock;
+ if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+ int mds_wanted;
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+ CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ *err = -EIO;
+ ret = 1;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci);
+ if ((mds_wanted & need) != need) {
+ dout("get_cap_refs %p caps were dropped"
+ " (session killed?)\n", inode);
+ *err = -ESTALE;
+ ret = 1;
+ goto out_unlock;
+ }
+ if ((mds_wanted & file_wanted) ==
+ (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+ ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
}
dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2504,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
if (err == -EAGAIN)
continue;
if (err < 0)
- return err;
+ ret = err;
} else {
ret = wait_event_interruptible(ci->i_cap_wq,
try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2513,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
continue;
if (err < 0)
ret = err;
- if (ret < 0)
- return ret;
+ }
+ if (ret < 0) {
+ if (err == -ESTALE) {
+ /* session was killed, try renew caps */
+ ret = ceph_renew_caps(&ci->vfs_inode);
+ if (ret == 0)
+ continue;
+ }
+ return ret;
}
if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2533,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
break;
}
+ if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+ ceph_fscache_revalidate_cookie(ci);
+
*got = _got;
return 0;
}
@@ -2774,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
bool writeback = false;
bool queue_trunc = false;
bool queue_invalidate = false;
- bool queue_revalidate = false;
bool deleted_inode = false;
bool fill_inline = false;
@@ -2807,7 +2833,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- !ci->i_wrbuffer_ref) {
+ !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
if (try_nonblocking_invalidate(inode)) {
/* there were locked pages.. invalidate later
in a separate thread. */
@@ -2816,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
-
- ceph_fscache_invalidate(inode);
}
/* side effects now are allowed */
@@ -2859,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
}
}
- /* Do we need to revalidate our fscache cookie. Don't bother on the
- * first cache cap as we already validate at cookie creation time. */
- if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
- queue_revalidate = true;
-
if (newcaps & CEPH_CAP_ANY_RD) {
/* ctime/mtime/atime? */
ceph_decode_timespec(&mtime, &grant->mtime);
@@ -2972,11 +2991,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (fill_inline)
ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
- if (queue_trunc) {
+ if (queue_trunc)
ceph_queue_vmtruncate(inode);
- ceph_queue_revalidate(inode);
- } else if (queue_revalidate)
- ceph_queue_revalidate(inode);
if (writeback)
/*
@@ -3178,10 +3194,8 @@ static void handle_cap_trunc(struct inode *inode,
truncate_seq, truncate_size, size);
spin_unlock(&ci->i_ceph_lock);
- if (queue_trunc) {
+ if (queue_trunc)
ceph_queue_vmtruncate(inode);
- ceph_fscache_invalidate(inode);
- }
}
/*
@@ -3226,6 +3240,8 @@ retry:
if (target < 0) {
__ceph_remove_cap(cap, false);
+ if (!ci->i_auth_cap)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
goto out_unlock;
}
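
One small but easy-to-miss change in the try_get_cap_refs() hunk above is the file_wanted test going from (file_wanted & need) == 0 to (file_wanted & need) != need: the open file must want all of the needed caps, not merely overlap with them. A tiny standalone illustration of the difference between the two mask tests; the CAP_* values are made-up bits, not the real CEPH_CAP_* encoding:

#include <stdio.h>

#define CAP_RD 0x1	/* illustrative values only */
#define CAP_WR 0x2

int main(void)
{
	unsigned file_wanted = CAP_RD;		/* file opened read-only */
	unsigned need = CAP_RD | CAP_WR;	/* caller needs read and write caps */

	/* old test: rejects only when there is no overlap at all */
	printf("old check rejects: %d\n", (file_wanted & need) == 0);

	/* new test: rejects unless every needed bit is wanted */
	printf("new check rejects: %d\n", (file_wanted & need) != need);
	return 0;
}

With these inputs the old check would have let the request through, while the new one returns -EBADF, which is the tightening the hunk performs.
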
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 31f831471ed2..39ff678e567f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
kfree(path);
- } else if (req->r_path2) {
+ } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
req->r_path2);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3ab1192d2029..6e0fedf6713b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -70,16 +70,42 @@ out_unlock:
}
/*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * for f_pos for readdir:
+ * - hash order:
+ * (0xff << 52) | ((24 bits hash) << 28) |
+ * (the nth entry has hash collision);
+ * - frag+name order;
+ * ((frag value) << 28) | (the nth entry in frag);
*/
+#define OFFSET_BITS 28
+#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+ loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+ if (hash_order)
+ fpos |= HASH_ORDER;
+ return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+ return (p & HASH_ORDER) == HASH_ORDER;
+}
+
static unsigned fpos_frag(loff_t p)
{
- return p >> 32;
+ return p >> OFFSET_BITS;
}
+
+static unsigned fpos_hash(loff_t p)
+{
+ return ceph_frag_value(fpos_frag(p));
+}
+
static unsigned fpos_off(loff_t p)
{
- return p & 0xffffffff;
+ return p & OFFSET_MASK;
}
static int fpos_cmp(loff_t l, loff_t r)
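
The comment and helpers above define the new readdir f_pos layout: in hash order bits 52..59 are set to 0xff and bits 28..51 carry a 24-bit name hash, in frag order bits 28 and up carry the frag value, and the low 28 bits are the entry index either way. A standalone sketch of the same bit layout, using constants that mirror OFFSET_BITS/HASH_ORDER above; the sample frag and hash values are made up:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OFFSET_BITS 28
#define OFFSET_MASK ((1u << OFFSET_BITS) - 1)
#define HASH_ORDER  (0xffull << (OFFSET_BITS + 24))

static int64_t make_fpos(unsigned high, unsigned off, bool hash_order)
{
	int64_t fpos = ((int64_t)high << OFFSET_BITS) | (int64_t)off;
	if (hash_order)
		fpos |= HASH_ORDER;
	return fpos;
}

static bool is_hash_order(int64_t p)  { return (p & HASH_ORDER) == HASH_ORDER; }
static unsigned fpos_high(int64_t p)  { return (unsigned)(p >> OFFSET_BITS); }
static unsigned fpos_off(int64_t p)   { return (unsigned)(p & OFFSET_MASK); }
static unsigned fpos_hash(int64_t p)  { return fpos_high(p) & 0xffffffu; }

int main(void)
{
	/* frag+name order: frag value in the high bits, entry index below */
	int64_t p1 = make_fpos(0x123456u, 7, false);	/* made-up frag */
	/* hash order: 24-bit name hash in the high bits, collision index below */
	int64_t p2 = make_fpos(0xabcdefu, 2, true);	/* made-up hash */

	printf("p1 = %#llx hash_order=%d off=%u\n",
	       (unsigned long long)p1, is_hash_order(p1), fpos_off(p1));
	printf("p2 = %#llx hash_order=%d hash=%#x off=%u\n",
	       (unsigned long long)p2, is_hash_order(p2),
	       fpos_hash(p2), fpos_off(p2));
	return 0;
}

Because a real frag never has 0xff in its top byte, a frag-order position can never be mistaken for a hash-order one, which is what is_hash_order() relies on.
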
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0;
}
+
+static struct dentry *
+__dcache_find_get_entry(struct dentry *parent, u64 idx,
+ struct ceph_readdir_cache_control *cache_ctl)
+{
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry;
+ unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
+ loff_t ptr_pos = idx * sizeof(struct dentry *);
+ pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
+
+ if (ptr_pos >= i_size_read(dir))
+ return NULL;
+
+ if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+ ceph_readdir_cache_release(cache_ctl);
+ cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
+ if (!cache_ctl->page) {
+ dout(" page %lu not found\n", ptr_pgoff);
+ return ERR_PTR(-EAGAIN);
+ }
+ /* reading/filling the cache are serialized by
+ i_mutex, no need to use page lock */
+ unlock_page(cache_ctl->page);
+ cache_ctl->dentries = kmap(cache_ctl->page);
+ }
+
+ cache_ctl->index = idx & idx_mask;
+
+ rcu_read_lock();
+ spin_lock(&parent->d_lock);
+ /* check i_size again here, because empty directory can be
+ * marked as complete while not holding the i_mutex. */
+ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
+ dentry = cache_ctl->dentries[cache_ctl->index];
+ else
+ dentry = NULL;
+ spin_unlock(&parent->d_lock);
+ if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+ dentry = NULL;
+ rcu_read_unlock();
+ return dentry ? : ERR_PTR(-EAGAIN);
+}
+
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct inode *dir = d_inode(parent);
struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
- unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
- int err = 0;
- loff_t ptr_pos = 0;
struct ceph_readdir_cache_control cache_ctl = {};
+ u64 idx = 0;
+ int err = 0;
- dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
+ dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
+
+ /* search start position */
+ if (ctx->pos > 2) {
+ u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
+ while (count > 0) {
+ u64 step = count >> 1;
+ dentry = __dcache_find_get_entry(parent, idx + step,
+ &cache_ctl);
+ if (!dentry) {
+ /* use linear search */
+ idx = 0;
+ break;
+ }
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
+ di = ceph_dentry(dentry);
+ spin_lock(&dentry->d_lock);
+ if (fpos_cmp(di->offset, ctx->pos) < 0) {
+ idx += step + 1;
+ count -= step + 1;
+ } else {
+ count = step;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
- /* we can calculate cache index for the first dirfrag */
- if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
- cache_ctl.index = fpos_off(ctx->pos) - 2;
- BUG_ON(cache_ctl.index < 0);
- ptr_pos = cache_ctl.index * sizeof(struct dentry *);
+ dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
}
- while (true) {
- pgoff_t pgoff;
- bool emit_dentry;
- if (ptr_pos >= i_size_read(dir)) {
+ for (;;) {
+ bool emit_dentry = false;
+ dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
+ if (!dentry) {
fi->flags |= CEPH_F_ATEND;
err = 0;
break;
}
-
- err = -EAGAIN;
- pgoff = ptr_pos >> PAGE_SHIFT;
- if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
- ceph_readdir_cache_release(&cache_ctl);
- cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
- if (!cache_ctl.page) {
- dout(" page %lu not found\n", pgoff);
- break;
- }
- /* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
- unlock_page(cache_ctl.page);
- cache_ctl.dentries = kmap(cache_ctl.page);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
}
- rcu_read_lock();
- spin_lock(&parent->d_lock);
- /* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
- if (ceph_dir_is_complete_ordered(dir) &&
- ptr_pos < i_size_read(dir))
- dentry = cache_ctl.dentries[cache_ctl.index % nsize];
- else
- dentry = NULL;
- spin_unlock(&parent->d_lock);
- if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
- dentry = NULL;
- rcu_read_unlock();
- if (!dentry)
- break;
-
- emit_dentry = false;
di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
if (di->lease_shared_gen == shared_gen &&
d_really_is_positive(dentry) &&
- ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
- ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0) {
emit_dentry = true;
}
spin_unlock(&dentry->d_lock);
if (emit_dentry) {
- dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+ dout(" %llx dentry %p %pd %p\n", di->offset,
dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
} else {
dput(dentry);
}
-
- cache_ctl.index++;
- ptr_pos += sizeof(struct dentry *);
}
+out:
ceph_readdir_cache_release(&cache_ctl);
if (last) {
int ret;
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
return err;
}
+static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
+{
+ if (!fi->last_readdir)
+ return true;
+ if (is_hash_order(pos))
+ return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
+ else
+ return fi->frag != fpos_frag(pos);
+}
+
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(ctx->pos);
- int off = fpos_off(ctx->pos);
+ int i;
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
+ dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
if (fi->flags & CEPH_F_ATEND)
return 0;
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
- off = 1;
}
if (ctx->pos == 1) {
ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
inode->i_mode >> 12))
return 0;
ctx->pos = 2;
- off = 2;
}
/* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
- frag = fpos_frag(ctx->pos);
- off = fpos_off(ctx->pos);
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* proceed with a normal readdir */
more:
/* do we have the correct frag content buffered? */
- if (fi->frag != frag || fi->last_readdir == NULL) {
+ if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req;
+ unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
@@ -305,6 +372,13 @@ more:
fi->last_readdir = NULL;
}
+ if (is_hash_order(ctx->pos)) {
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
+ } else {
+ frag = fpos_frag(ctx->pos);
+ }
+
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
req->r_readdir_cache_idx = fi->readdir_cache_idx;
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
+ req->r_args.readdir.flags =
+ cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
req->r_inode = inode;
ihold(inode);
@@ -340,22 +416,26 @@ more:
ceph_mdsc_put_request(req);
return err;
}
- dout("readdir got and parsed readdir result=%d"
- " on frag %x, end=%d, complete=%d\n", err, frag,
+ dout("readdir got and parsed readdir result=%d on "
+ "frag %x, end=%d, complete=%d, hash_order=%d\n",
+ err, frag,
(int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete);
-
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
- /* note next offset and last dentry name */
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag);
- off = req->r_readdir_offset;
- fi->next_offset = off;
+ if (!rinfo->hash_order) {
+ fi->next_offset = req->r_readdir_offset;
+ /* adjust ctx->pos to beginning of frag */
+ ctx->pos = ceph_make_fpos(frag,
+ fi->next_offset,
+ false);
+ }
}
fi->frag = frag;
- fi->offset = fi->next_offset;
fi->last_readdir = req;
if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
if (fi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */
fi->dir_ordered_count = 0;
- } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+ } else if (ceph_frag_is_leftmost(frag) &&
+ fi->next_offset == 2) {
/* note dir version at start of readdir so
* we can tell if any dentries get dropped */
fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
fi->dir_release_count = 0;
}
- if (req->r_reply_info.dir_end) {
- kfree(fi->last_name);
- fi->last_name = NULL;
- if (ceph_frag_is_rightmost(frag))
- fi->next_offset = 2;
- else
- fi->next_offset = 0;
- } else {
- err = note_last_dentry(fi,
- rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1],
- fi->next_offset + rinfo->dir_nr);
+ /* note next offset and last dentry name */
+ if (rinfo->dir_nr > 0) {
+ struct ceph_mds_reply_dir_entry *rde =
+ rinfo->dir_entries + (rinfo->dir_nr-1);
+ unsigned next_offset = req->r_reply_info.dir_end ?
+ 2 : (fpos_off(rde->offset) + 1);
+ err = note_last_dentry(fi, rde->name, rde->name_len,
+ next_offset);
if (err)
return err;
+ } else if (req->r_reply_info.dir_end) {
+ fi->next_offset = 2;
+ /* keep last name */
}
}
rinfo = &fi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
- rinfo->dir_nr, off, fi->offset);
-
- ctx->pos = ceph_make_fpos(frag, off);
- while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- struct ceph_mds_reply_inode *in =
- rinfo->dir_in[off - fi->offset].in;
+ dout("readdir frag %x num %d pos %llx chunk first %llx\n",
+ fi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+
+ i = 0;
+ /* search start position */
+ if (rinfo->dir_nr > 0) {
+ int step, nr = rinfo->dir_nr;
+ while (nr > 0) {
+ step = nr >> 1;
+ if (rinfo->dir_entries[i + step].offset < ctx->pos) {
+ i += step + 1;
+ nr -= step + 1;
+ } else {
+ nr = step;
+ }
+ }
+ }
+ for (; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
ino_t ino;
- dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, ctx->pos,
- rinfo->dir_dname_len[off - fi->offset],
- rinfo->dir_dname[off - fi->offset], in);
- BUG_ON(!in);
- ftype = le32_to_cpu(in->mode) >> 12;
- vino.ino = le64_to_cpu(in->ino);
- vino.snap = le64_to_cpu(in->snapid);
+ BUG_ON(rde->offset < ctx->pos);
+
+ ctx->pos = rde->offset;
+ dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
+ i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
+
+ BUG_ON(!rde->inode.in);
+ ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
ino = ceph_vino_to_ino(vino);
- if (!dir_emit(ctx,
- rinfo->dir_dname[off - fi->offset],
- rinfo->dir_dname_len[off - fi->offset],
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
+
+ if (!dir_emit(ctx, rde->name, rde->name_len,
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n");
return 0;
}
- off++;
ctx->pos++;
}
- if (fi->last_name) {
+ if (fi->next_offset > 2) {
ceph_mdsc_put_request(fi->last_readdir);
fi->last_readdir = NULL;
goto more;
}
/* more frags? */
- if (!ceph_frag_is_rightmost(frag)) {
- frag = ceph_frag_next(frag);
- off = 0;
- ctx->pos = ceph_make_fpos(frag, off);
+ if (!ceph_frag_is_rightmost(fi->frag)) {
+ unsigned frag = ceph_frag_next(fi->frag);
+ if (is_hash_order(ctx->pos)) {
+ loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+ fi->next_offset, true);
+ if (new_pos > ctx->pos)
+ ctx->pos = new_pos;
+ /* keep last_name */
+ } else {
+ ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
+ kfree(fi->last_name);
+ fi->last_name = NULL;
+ }
dout("readdir next frag is %x\n", frag);
goto more;
}
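
Both "search start position" loops above, the one over the cached dentry pointers in __dcache_readdir() and the one over rinfo->dir_entries here, are the same lower-bound binary search: find the first entry whose offset is not less than ctx->pos. The loop shape, extracted into a small standalone form over a sorted array of offsets; the array contents are made up:

#include <stdio.h>

/* Return the index of the first element >= pos (lower bound), or n if
 * every element is smaller; the same loop shape as the readdir paths above. */
static int lower_bound(const long long *offsets, int n, long long pos)
{
	int i = 0;

	while (n > 0) {
		int step = n >> 1;

		if (offsets[i + step] < pos) {
			i += step + 1;
			n -= step + 1;
		} else {
			n = step;
		}
	}
	return i;
}

int main(void)
{
	long long offsets[] = { 2, 5, 9, 14, 20 };	/* sorted dentry offsets */
	int n = sizeof(offsets) / sizeof(offsets[0]);

	printf("resume at index %d for pos 9\n", lower_bound(offsets, n, 9));
	printf("resume at index %d for pos 10\n", lower_bound(offsets, n, 10));
	printf("resume at index %d for pos 99\n", lower_bound(offsets, n, 99));
	return 0;
}

Returning n when every offset is smaller corresponds to the readdir code simply falling off the end of the chunk and fetching the next one.
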
@@ -467,7 +570,7 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
+static void reset_readdir(struct ceph_file_info *fi)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
fi->last_name = NULL;
fi->dir_release_count = 0;
fi->readdir_cache_idx = -1;
- if (ceph_frag_is_leftmost(frag))
- fi->next_offset = 2; /* compensate for . and .. */
- else
- fi->next_offset = 0;
+ fi->next_offset = 2; /* compensate for . and .. */
fi->flags &= ~CEPH_F_ATEND;
}
+/*
+ * discard buffered readdir content on seekdir(0), or seek to new frag,
+ * or seek prior to current chunk
+ */
+static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
+{
+ struct ceph_mds_reply_info_parsed *rinfo;
+ loff_t chunk_offset;
+ if (new_pos == 0)
+ return true;
+ if (is_hash_order(new_pos)) {
+ /* no need to reset last_name for a forward seek when
+ * dentries are sorted in hash order */
+ } else if (fi->frag != fpos_frag(new_pos)) {
+ return true;
+ }
+ rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
+ if (!rinfo || !rinfo->dir_nr)
+ return true;
+ chunk_offset = rinfo->dir_entries[0].offset;
+ return new_pos < chunk_offset ||
+ is_hash_order(new_pos) != is_hash_order(chunk_offset);
+}
+
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
if (offset >= 0) {
+ if (need_reset_readdir(fi, offset)) {
+ dout("dir_llseek dropping %p content\n", file);
+ reset_readdir(fi);
+ } else if (is_hash_order(offset) && offset > file->f_pos) {
+ /* for hash offset, we don't know if a forward seek
+ * is within same frag */
+ fi->dir_release_count = 0;
+ fi->readdir_cache_idx = -1;
+ }
+
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
fi->flags &= ~CEPH_F_ATEND;
}
retval = offset;
-
- if (offset == 0 ||
- fpos_frag(offset) != fi->frag ||
- fpos_off(offset) < fi->offset) {
- /* discard buffered readdir content on seekdir(0), or
- * seek to new frag, or seek prior to current chunk */
- dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi, fpos_frag(offset));
- } else if (fpos_cmp(offset, old_offset) > 0) {
- /* reset dir_release_count if we did a forward seek */
- fi->dir_release_count = 0;
- fi->readdir_cache_idx = -1;
- }
}
out:
inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
return dentry;
}
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
return ceph_ino(inode) == CEPH_INO_ROOT &&
strncmp(dentry->d_name.name, ".ceph", 5) == 0;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4f1dc7120916..ce2f5795e44b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
struct ceph_file_info *cf;
int ret = 0;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- /* First file open request creates the cookie, we want to keep
- * this cookie around for the filetime of the inode as not to
- * have to worry about fscache register / revoke / operation
- * races.
- *
- * Also, if we know the operation is going to invalidate data
- * (non readonly) just nuke the cache right away.
- */
- ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
- if ((fmode & CEPH_FILE_MODE_WR))
- ceph_fscache_invalidate(inode);
+ ceph_fscache_register_inode_cookie(inode);
+ ceph_fscache_file_set_cookie(inode, file);
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
@@ -192,6 +180,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
}
/*
+ * try renew caps after session gets killed.
+ */
+int ceph_renew_caps(struct inode *inode)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_request *req;
+ int err, flags, wanted;
+
+ spin_lock(&ci->i_ceph_lock);
+ wanted = __ceph_caps_file_wanted(ci);
+ if (__ceph_is_any_real_caps(ci) &&
+ (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) {
+ int issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+ dout("renew caps %p want %s issued %s updating mds_wanted\n",
+ inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+ ceph_check_caps(ci, 0, NULL);
+ return 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ flags = 0;
+ if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
+ flags = O_RDWR;
+ else if (wanted & CEPH_CAP_FILE_RD)
+ flags = O_RDONLY;
+ else if (wanted & CEPH_CAP_FILE_WR)
+ flags = O_WRONLY;
+#ifdef O_LAZY
+ if (wanted & CEPH_CAP_FILE_LAZYIO)
+ flags |= O_LAZY;
+#endif
+
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+ req->r_fmode = -1;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+out:
+ dout("renew caps %p open result=%d\n", inode, err);
+ return err < 0 ? err : 0;
+}
+
+/*
* If we already have the requisite capabilities, we can satisfy
* the open request locally (no need to request new caps from the
* MDS). We do, however, need to inform the MDS (asynchronously)
@@ -616,8 +657,7 @@ static void ceph_aio_complete(struct inode *inode,
kfree(aio_req);
}
-static void ceph_aio_complete_req(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
{
int rc = req->r_result;
struct inode *inode = req->r_inode;
@@ -714,14 +754,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
- req->r_base_oloc = orig_req->r_base_oloc;
- req->r_base_oid = orig_req->r_base_oid;
+ ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
+ ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ req = orig_req;
+ goto out;
+ }
req->r_ops[0] = orig_req->r_ops[0];
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
- ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
- snapc, CEPH_NOSNAP, &aio_req->mtime);
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
ceph_osdc_put_request(orig_req);
@@ -733,7 +780,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
out:
if (ret < 0) {
req->r_result = ret;
- ceph_aio_complete_req(req, NULL);
+ ceph_aio_complete_req(req);
}
ceph_put_snap_context(snapc);
@@ -764,6 +811,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
list_add_tail(&req->r_unsafe_item,
&ci->i_unsafe_writes);
spin_unlock(&ci->i_unsafe_lock);
+
+ complete_all(&req->r_completion);
} else {
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_item);
@@ -875,14 +924,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(pos+len) | (PAGE_SIZE - 1));
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+ req->r_mtime = mtime;
}
-
osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
false, false);
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
if (aio_req) {
aio_req->total_len += len;
aio_req->num_reqs++;
@@ -956,7 +1003,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req, false);
if (ret < 0) {
req->r_result = ret;
- ceph_aio_complete_req(req, NULL);
+ ceph_aio_complete_req(req);
}
}
return -EIOCBQUEUED;
@@ -1067,9 +1114,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
false, true);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
+ req->r_mtime = mtime;
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1292,7 +1337,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
retry_snap:
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
goto out;
}
@@ -1350,7 +1395,6 @@ retry_snap:
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
} else {
- loff_t old_size = i_size_read(inode);
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -1361,8 +1405,6 @@ retry_snap:
written = generic_perform_write(file, from, pos);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
- if (i_size_read(inode) > old_size)
- ceph_fscache_update_objectsize(inode);
inode_unlock(inode);
}
@@ -1383,7 +1425,7 @@ retry_snap:
ceph_put_cap_refs(ci, got);
if (written >= 0) {
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL))
iocb->ki_flags |= IOCB_DSYNC;
written = generic_write_sync(iocb, written);
@@ -1524,9 +1566,7 @@ static int ceph_zero_partial_object(struct inode *inode,
goto out;
}
- ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
- &inode->i_mtime);
-
+ req->r_mtime = inode->i_mtime;
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) {
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1617,8 +1657,8 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
- !(mode & FALLOC_FL_PUNCH_HOLE)) {
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
+ !(mode & FALLOC_FL_PUNCH_HOLE)) {
ret = -ENOSPC;
goto unlock;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e669cfa9d793..f059b5997072 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
+#include <linux/sort.h>
#include "super.h"
#include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
diri_auth = ci->i_auth_cap->mds;
spin_unlock(&ci->i_ceph_lock);
+ if (mds == -1) /* CDIR_AUTH_PARENT */
+ mds = diri_auth;
+
mutex_lock(&ci->i_fragtree_mutex);
if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
return err;
}
+static int frag_tree_split_cmp(const void *l, const void *r)
+{
+ struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
+ struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
+ return ceph_frag_compare(ls->frag, rs->frag);
+}
+
+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
+{
+ if (!frag)
+ return f == ceph_frag_make(0, 0);
+ if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
+ return false;
+ return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
+}
+
static int ceph_fill_fragtree(struct inode *inode,
struct ceph_frag_tree_head *fragtree,
struct ceph_mds_reply_dirfrag *dirinfo)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_inode_frag *frag;
+ struct ceph_inode_frag *frag, *prev_frag = NULL;
struct rb_node *rb_node;
- int i;
- u32 id, nsplits;
+ unsigned i, split_by, nsplits;
+ u32 id;
bool update = false;
mutex_lock(&ci->i_fragtree_mutex);
nsplits = le32_to_cpu(fragtree->nsplits);
- if (nsplits) {
+ if (nsplits != ci->i_fragtree_nsplits) {
+ update = true;
+ } else if (nsplits) {
i = prandom_u32() % nsplits;
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
if (!update)
goto out_unlock;
+ if (nsplits > 1) {
+ sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
+ frag_tree_split_cmp, NULL);
+ }
+
dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
rb_node = rb_first(&ci->i_fragtree);
for (i = 0; i < nsplits; i++) {
id = le32_to_cpu(fragtree->splits[i].frag);
+ split_by = le32_to_cpu(fragtree->splits[i].by);
+ if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
+ pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", ceph_vinop(inode),
+ i, nsplits, id, split_by);
+ continue;
+ }
frag = NULL;
while (rb_node) {
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
break;
}
rb_node = rb_next(rb_node);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
frag = NULL;
}
if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
if (IS_ERR(frag))
continue;
}
- frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+ if (frag->split_by == 0)
+ ci->i_fragtree_nsplits++;
+ frag->split_by = split_by;
dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ prev_frag = frag;
}
while (rb_node) {
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
rb_node = rb_next(rb_node);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
}
out_unlock:
mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
rb_erase(n, &ci->i_fragtree);
kfree(frag);
}
+ ci->i_fragtree_nsplits = 0;
__ceph_destroy_xattrs(ci);
if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
return 1;
}
+static inline blkcnt_t calc_inode_blocks(u64 size)
+{
+ return (size + (1<<9) - 1) >> 9;
+}
+
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
size = 0;
}
i_size_write(inode, size);
- inode->i_blocks = (size + (1<<9) - 1) >> 9;
+ inode->i_blocks = calc_inode_blocks(size);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
- err = -EINVAL;
- if (WARN_ON(symlen != i_size_read(inode)))
- goto out;
+ if (symlen != i_size_read(inode)) {
+ pr_err("fill_inode %llx.%llx BAD symlink "
+ "size %lld\n", ceph_vinop(inode),
+ i_size_read(inode));
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ }
err = -ENOMEM;
sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
struct inode *in;
int rc;
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
in = ceph_get_inode(req->r_dentry->d_sb, vino);
if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+ rc = fill_inode(in, NULL, &rde->inode, NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (rc < 0) {
pr_err("fill_inode badness on %p got %d\n", in, rc);
err = rc;
- continue;
}
+ iput(in);
}
return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct dentry *parent = req->r_dentry;
+ struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct qstr dname;
struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
int err = 0, skipped = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- struct ceph_dentry_info *di;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ u32 last_hash = 0;
+ u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
+ if (rinfo->hash_order && req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2, strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ }
+
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
dout("readdir_prepopulate got new frag %x -> %x\n",
frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
- if (ceph_frag_is_leftmost(frag))
+ if (!rinfo->hash_order)
req->r_readdir_offset = 2;
- else
- req->r_readdir_offset = 0;
}
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
- struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
+ fpos_offset = req->r_readdir_offset;
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
- dname.name = rinfo->dir_dname[i];
- dname.len = rinfo->dir_dname_len[i];
+ dname.name = rde->name;
+ dname.len = rde->name_len;
dname.hash = full_name_hash(dname.name, dname.len);
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+ if (rinfo->hash_order) {
+ u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ rde->name, rde->name_len);
+ hash = ceph_frag_value(hash);
+ if (hash != last_hash)
+ fpos_offset = 2;
+ last_hash = hash;
+ rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+ } else {
+ rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+ }
retry_lookup:
dn = d_lookup(parent, &dname);
@@ -1490,7 +1569,7 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+ ret = fill_inode(in, NULL, &rde->inode, NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
dn = realdn;
}
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+ ceph_dentry(dn)->offset = rde->offset;
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
+ update_dentry_lease(dn, rde->lease, req->r_session,
req->r_request_started);
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
i_size_write(inode, size);
- inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+ inode->i_blocks = calc_inode_blocks(size);
/* tell the MDS if we are approaching max_size */
if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_pg_inv_work);
struct inode *inode = &ci->vfs_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
u32 orig_gen;
int check = 0;
mutex_lock(&ci->i_truncate_mutex);
+
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
+ inode, ceph_ino(inode));
+ mapping_set_error(inode->i_mapping, -EIO);
+ truncate_pagecache(inode, 0);
+ mutex_unlock(&ci->i_truncate_mutex);
+ goto out;
+ }
+
spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_pagecache(inode, 0);
+ if (invalidate_inode_pages2(inode->i_mapping) < 0) {
+ pr_err("invalidate_pages %p fails\n", inode);
+ }
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
- inode->i_blocks =
- (attr->ia_size + (1 << 9) - 1) >> 9;
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
inode->i_ctime = attr->ia_ctime;
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
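
The inode.c hunks above funnel the repeated open-coded (size + (1<<9) - 1) >> 9 conversions through the new calc_inode_blocks() helper, which simply rounds a byte count up to the 512-byte units that i_blocks is accounted in. A minimal userspace sketch of the same rounding; calc_blocks() and the sample values are illustrative, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* Round a byte count up to 512-byte sectors, as i_blocks is accounted. */
static uint64_t calc_blocks(uint64_t size)
{
	return (size + (1 << 9) - 1) >> 9;
}

int main(void)
{
	/* 0 -> 0, 1 -> 1, 512 -> 1, 513 -> 2, 4096 -> 8 */
	printf("%llu %llu %llu %llu %llu\n",
	       (unsigned long long)calc_blocks(0),
	       (unsigned long long)calc_blocks(1),
	       (unsigned long long)calc_blocks(512),
	       (unsigned long long)calc_blocks(513),
	       (unsigned long long)calc_blocks(4096));
	return 0;
}
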
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT;
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
if (r < 0) {
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
return -EIO;
}
dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
ceph_ino(inode), dl.object_no);
oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
- ceph_oid_set_name(&oid, dl.object_name);
+ ceph_oid_printf(&oid, "%s", dl.object_name);
- r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+ r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
if (r < 0) {
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
return r;
}
- dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+ dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
} else {
memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
}
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
/* send result back to user */
if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85b8517f17a0..2103b823bec0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
- info->dir_end = ceph_decode_8(p);
- info->dir_complete = ceph_decode_8(p);
+ {
+ u16 flags = ceph_decode_16(p);
+ info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+ info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+ info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ }
if (num == 0)
goto done;
- BUG_ON(!info->dir_in);
- info->dir_dname = (void *)(info->dir_in + num);
- info->dir_dname_len = (void *)(info->dir_dname + num);
- info->dir_dlease = (void *)(info->dir_dname_len + num);
- if ((unsigned long)(info->dir_dlease + num) >
- (unsigned long)info->dir_in + info->dir_buf_size) {
+ BUG_ON(!info->dir_entries);
+ if ((unsigned long)(info->dir_entries + num) >
+ (unsigned long)info->dir_entries + info->dir_buf_size) {
pr_err("dir contents are larger than expected\n");
WARN_ON(1);
goto bad;
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_nr = num;
while (num) {
+ struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad);
- info->dir_dname_len[i] = ceph_decode_32(p);
- ceph_decode_need(p, end, info->dir_dname_len[i], bad);
- info->dir_dname[i] = *p;
- *p += info->dir_dname_len[i];
- dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
- info->dir_dname[i]);
- info->dir_dlease[i] = *p;
+ rde->name_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, rde->name_len, bad);
+ rde->name = *p;
+ *p += rde->name_len;
+ dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
+ rde->lease = *p;
*p += sizeof(struct ceph_mds_reply_lease);
/* inode */
- err = parse_reply_info_in(p, end, &info->dir_in[i], features);
+ err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
goto out_bad;
+ /* ceph_readdir_prepopulate() will update it */
+ rde->offset = 0;
i++;
num--;
}
@@ -345,9 +348,9 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- if (!info->dir_in)
+ if (!info->dir_entries)
return;
- free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
+ free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
kfree(req);
}
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
/*
* lookup session, bump ref if found.
*
* called under mdsc->mutex.
*/
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
- u64 tid)
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
{
struct ceph_mds_request *req;
- struct rb_node *n = mdsc->request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mds_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else {
- ceph_mdsc_get_request(req);
- return req;
- }
- }
- return NULL;
-}
-static void __insert_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *new)
-{
- struct rb_node **p = &mdsc->request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mds_request *req = NULL;
+ req = lookup_request(&mdsc->request_tree, tid);
+ if (req)
+ ceph_mdsc_get_request(req);
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mds_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &mdsc->request_tree);
+ return req;
}
/*
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
req->r_num_caps);
dout("__register_request %p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req);
- __insert_request(mdsc, req);
+ insert_request(&mdsc->request_tree, req);
req->r_uid = current_fsuid();
req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
}
}
- rb_erase(&req->r_node, &mdsc->request_tree);
- RB_CLEAR_NODE(&req->r_node);
+ erase_request(&mdsc->request_tree, req);
if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_bytes = 0;
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
+ struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
void *p;
const char* metadata[][2] = {
{"hostname", utsname()->nodename},
{"kernel_version", utsname()->release},
- {"entity_id", opt->name ? opt->name : ""},
+ {"entity_id", opt->name ? : ""},
+ {"root", fsopt->server_path ? : "/"},
{NULL, NULL}
};
@@ -1149,9 +1125,11 @@ out:
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
struct ceph_inode_info *ci = ceph_inode(inode);
LIST_HEAD(to_remove);
- int drop = 0;
+ bool drop = false;
+ bool invalidate = false;
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+
+ if (ci->i_wrbuffer_ref > 0 &&
+ ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ invalidate = true;
while (true) {
struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
inode, ceph_ino(inode));
ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item);
- drop = 1;
+ drop = true;
}
if (!list_empty(&ci->i_flushing_item)) {
pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--;
- drop = 1;
+ drop = true;
}
spin_unlock(&mdsc->cap_dirty_lock);
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
list_del(&cf->list);
ceph_free_cap_flush(cf);
}
- while (drop--)
+
+ wake_up_all(&ci->i_cap_wq);
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ if (drop)
iput(inode);
return 0;
}
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
*/
static void remove_session_caps(struct ceph_mds_session *session)
{
+ struct ceph_fs_client *fsc = session->s_mdsc->fsc;
+ struct super_block *sb = fsc->sb;
dout("remove_session_caps on %p\n", session);
- iterate_session_caps(session, remove_session_caps_cb, NULL);
+ iterate_session_caps(session, remove_session_caps_cb, fsc);
spin_lock(&session->s_cap_lock);
if (session->s_nr_caps > 0) {
- struct super_block *sb = session->s_mdsc->fsc->sb;
struct inode *inode;
struct ceph_cap *cap, *prev = NULL;
struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = ceph_inode(inode);
- wake_up_all(&ci->i_cap_wq);
if (arg) {
spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
}
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
- size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
- sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+ size_t size = sizeof(struct ceph_mds_reply_dir_entry);
int order, num_entries;
spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
- rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
- order);
- if (rinfo->dir_in)
+ rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
+ __GFP_NOWARN,
+ order);
+ if (rinfo->dir_entries)
break;
order--;
}
- if (!rinfo->dir_in)
+ if (!rinfo->dir_entries)
return -ENOMEM;
num_entries = (PAGE_SIZE << order) / size;
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
+ RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
init_completion(&req->r_completion);
init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* get request, session */
tid = le64_to_cpu(msg->hdr.tid);
mutex_lock(&mdsc->mutex);
- req = __lookup_request(mdsc, tid);
+ req = lookup_get_request(mdsc, tid);
if (!req) {
dout("handle_reply on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
fwd_seq = ceph_decode_32(&p);
mutex_lock(&mdsc->mutex);
- req = __lookup_request(mdsc, tid);
+ req = lookup_get_request(mdsc, tid);
if (!req) {
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
goto out; /* dup reply? */
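
Earlier in this file's hunks, ceph_alloc_readdir_reply_buffer() now sizes the reply buffer in units of a single struct ceph_mds_reply_dir_entry, falls back to smaller page orders when the allocation fails, and then recomputes how many entries actually fit in what it got. A hedged userspace sketch of that sizing loop; PAGE_SIZE, order_for() and alloc_entry_buffer() are stand-ins, not kernel interfaces:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u		/* assumed page size for the sketch */

/* Smallest order such that (PAGE_SIZE << order) >= len, like get_order(). */
static int order_for(size_t len)
{
	int order = 0;

	while (((size_t)PAGE_SIZE << order) < len)
		order++;
	return order;
}

/*
 * Try to grab the largest power-of-two-pages buffer available, then
 * recompute how many fixed-size entries fit in the buffer we actually got.
 */
static void *alloc_entry_buffer(size_t entry_size, unsigned *num_entries)
{
	int order = order_for(entry_size * *num_entries);
	void *buf = NULL;

	for (; order >= 0; order--) {
		buf = malloc((size_t)PAGE_SIZE << order);
		if (buf)
			break;
	}
	if (!buf)
		return NULL;
	*num_entries = (unsigned)(((size_t)PAGE_SIZE << order) / entry_size);
	return buf;
}

int main(void)
{
	unsigned n = 1000;
	void *buf = alloc_entry_buffer(64, &n);	/* ask for 1000 64-byte entries */

	printf("got room for %u entries\n", buf ? n : 0);
	free(buf);
	return 0;
}
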
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ee69a537dba5..e7d38aac7109 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
u32 pool_ns_len;
};
+struct ceph_mds_reply_dir_entry {
+ char *name;
+ u32 name_len;
+ struct ceph_mds_reply_lease *lease;
+ struct ceph_mds_reply_info_in inode;
+ loff_t offset;
+};
+
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size;
int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
+ bool dir_complete;
+ bool dir_end;
+ bool hash_order;
+ struct ceph_mds_reply_dir_entry *dir_entries;
};
/* for create results */
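
The struct added just above replaces the four parallel arrays (dir_dname, dir_dname_len, dir_dlease, dir_in) with one record per dentry, so parse_reply_info_dir() fills a single array in place. A rough userspace sketch of decoding into such a record array; the wire format here (a bare length-prefixed name stream) is invented for illustration, and the lease/inode decoding is omitted:

#include <stdint.h>
#include <string.h>

/* One record per dentry, loosely mirroring struct ceph_mds_reply_dir_entry
 * above (lease and inode fields are left out of this sketch). */
struct dir_entry {
	const char *name;
	uint32_t name_len;
	long long offset;	/* filled in later by the readdir code */
};

/* Parse a toy "u32 name_len, name bytes" stream into the entry table. */
static int parse_entries(const uint8_t *p, const uint8_t *end,
			 struct dir_entry *ents, int num)
{
	int i;

	for (i = 0; i < num; i++) {
		uint32_t len;

		if ((size_t)(end - p) < sizeof(len))
			return -1;
		memcpy(&len, p, sizeof(len));
		p += sizeof(len);
		if ((size_t)(end - p) < len)
			return -1;
		ents[i].name = (const char *)p;
		ents[i].name_len = len;
		ents[i].offset = 0;	/* prepopulate code fills this in later */
		p += len;
	}
	return 0;
}
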
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 261531e55e9d..8c3591a7fbae 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
const void *start = *p;
int i, j, n;
int err = -EINVAL;
- u16 version;
+ u8 mdsmap_v, mdsmap_cv;
m = kzalloc(sizeof(*m), GFP_NOFS);
if (m == NULL)
return ERR_PTR(-ENOMEM);
- ceph_decode_16_safe(p, end, version, bad);
- if (version > 3) {
- pr_warn("got mdsmap version %d > 3, failing", version);
- goto bad;
+ ceph_decode_need(p, end, 1 + 1, bad);
+ mdsmap_v = ceph_decode_8(p);
+ mdsmap_cv = ceph_decode_8(p);
+ if (mdsmap_v >= 4) {
+ u32 mdsmap_len;
+ ceph_decode_32_safe(p, end, mdsmap_len, bad);
+ if (end < *p + mdsmap_len)
+ goto bad;
+ end = *p + mdsmap_len;
}
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 namelen;
s32 mds, inc, state;
u64 state_seq;
- u8 infoversion;
+ u8 info_v;
+ void *info_end = NULL;
struct ceph_entity_addr addr;
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
struct ceph_mds_info *info;
- ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+ ceph_decode_need(p, end, sizeof(u64) + 1, bad);
global_id = ceph_decode_64(p);
- infoversion = ceph_decode_8(p);
+ info_v = ceph_decode_8(p);

+ if (info_v >= 4) {
+ u32 info_len;
+ u8 info_cv;
+ ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+ info_cv = ceph_decode_8(p);
+ info_len = ceph_decode_32(p);
+ info_end = *p + info_len;
+ if (info_end > end)
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
*p += sizeof(u64);
namelen = ceph_decode_32(p); /* skip mds name */
*p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen;
- if (infoversion >= 2) {
+ if (info_v >= 2) {
ceph_decode_32_safe(p, end, num_export_targets, bad);
pexport_targets = *p;
*p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
num_export_targets = 0;
}
+ if (info_end && *p != info_end) {
+ if (*p > info_end)
+ goto bad;
+ *p = info_end;
+ }
+
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */
+ *p = end;
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m;
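
The mdsmap decoder above now understands the versioned encoding: a v4+ map, and each v4+ per-MDS info block, carries a length so the decoder can bound its reads and jump to the end of a sub-block whose newer fields it does not understand. A simplified sketch of that pattern under an invented record layout; struct cursor, get_u8()/get_u32() and decode_record() are not kernel helpers:

#include <stdint.h>
#include <string.h>

struct cursor { const uint8_t *p, *end; };

static int get_u8(struct cursor *c, uint8_t *v)
{
	if (c->end - c->p < 1)
		return -1;
	*v = *c->p++;
	return 0;
}

static int get_u32(struct cursor *c, uint32_t *v)
{
	if (c->end - c->p < 4)
		return -1;
	memcpy(v, c->p, 4);
	c->p += 4;
	return 0;
}

static int decode_record(struct cursor *c)
{
	uint8_t version;
	const uint8_t *record_end = NULL;

	if (get_u8(c, &version))
		return -1;
	if (version >= 4) {		/* length-prefixed from v4 on */
		uint32_t len;

		if (get_u32(c, &len) || (size_t)(c->end - c->p) < len)
			return -1;
		record_end = c->p + len;
	}

	/* ... decode the fields this code understands ... */

	if (record_end) {
		if (c->p > record_end)	/* over-read: the record lied */
			return -1;
		c->p = record_end;	/* skip fields added by newer versions */
	}
	return 0;
}
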
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f12d5e2955c2..91e02481ce06 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
* mount options
*/
enum {
+ Opt_mds_namespace,
Opt_wsize,
Opt_rsize,
Opt_rasize,
@@ -143,6 +144,7 @@ enum {
};
static match_table_t fsopt_tokens = {
+ {Opt_mds_namespace, "mds_namespace=%d"},
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
{Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
break;
/* misc */
+ case Opt_mds_namespace:
+ fsopt->mds_namespace = intval;
+ break;
case Opt_wsize:
fsopt->wsize = intval;
break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
{
dout("destroy_mount_options %p\n", args);
kfree(args->snapdir_name);
+ kfree(args->server_path);
kfree(args);
}
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+ if (ret)
+ return ret;
+
return ceph_compare_options(new_opt, fsc->client);
}
static int parse_mount_options(struct ceph_mount_options **pfsopt,
struct ceph_options **popt,
int flags, char *options,
- const char *dev_name,
- const char **path)
+ const char *dev_name)
{
struct ceph_mount_options *fsopt;
const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
+ fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
/*
* Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
*/
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- /* skip over leading '/' for path */
- *path = dev_name_end + 1;
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path) {
+ err = -ENOMEM;
+ goto out;
+ }
} else {
- /* path is empty */
dev_name_end = dev_name + strlen(dev_name);
- *path = dev_name_end;
}
err = -EINVAL;
dev_name_end--; /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
goto out;
}
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
- dout("server path '%s'\n", *path);
+ if (fsopt->server_path)
+ dout("server path '%s'\n", fsopt->server_path);
*popt = ceph_parse_options(options, dev_name, dev_name_end,
parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
+ if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
+ seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
{
struct ceph_fs_client *fsc;
const u64 supported_features =
- CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH |
- CEPH_FEATURE_MDS_INLINE_DATA;
+ CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
+ CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
const u64 required_features = 0;
int page_count;
size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+ fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
/*
* mount: join the ceph cluster, and open root directory.
*/
-static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
- const char *path)
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
{
int err;
unsigned long started = jiffies; /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto fail;
}
- if (path[0] == 0) {
+ if (!fsc->mount_options->server_path) {
root = fsc->sb->s_root;
dget(root);
} else {
- dout("mount opening base mountpoint\n");
+ const char *path = fsc->mount_options->server_path + 1;
+ dout("mount opening path %s\n", path);
root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
struct dentry *res;
int err;
int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
- const char *path = NULL;
struct ceph_mount_options *fsopt = NULL;
struct ceph_options *opt = NULL;
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
flags |= MS_POSIXACL;
#endif
- err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+ err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
if (err < 0) {
res = ERR_PTR(err);
goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
}
}
- res = ceph_real_mount(fsc, path);
+ res = ceph_real_mount(fsc);
if (IS_ERR(res))
goto out_splat;
dout("root %p inode %p ino %llx.%llx\n", res,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b99eb756477..0168b49fb6ad 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_options {
int cap_release_safety;
int max_readdir; /* max readdir result (entries) */

int max_readdir_bytes; /* max readdir result (bytes) */
+ int mds_namespace;
/*
* everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
*/
char *snapdir_name; /* default ".snap" */
+ char *server_path; /* default "/" */
};
struct ceph_fs_client {
@@ -101,7 +103,6 @@ struct ceph_fs_client {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
- struct workqueue_struct *revalidate_wq;
#endif
};
@@ -295,6 +296,7 @@ struct ceph_inode_info {
u64 i_files, i_subdirs;
struct rb_root i_fragtree;
+ int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
@@ -357,8 +359,7 @@ struct ceph_inode_info {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
- u32 i_fscache_gen; /* sequence, for delayed fscache validate */
- struct work_struct i_revalidate_work;
+ u32 i_fscache_gen;
#endif
struct inode vfs_inode; /* at end */
};
@@ -469,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
+#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -537,11 +539,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
- return ((loff_t)frag << 32) | (loff_t)off;
-}
-
/*
* caps helpers
*/
@@ -632,7 +629,6 @@ struct ceph_file_info {
struct ceph_mds_request *last_readdir;
/* readdir: position within a frag */
- unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
long long dir_release_count;
@@ -927,6 +923,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
+extern int ceph_renew_caps(struct inode *inode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +939,7 @@ extern const struct inode_operations ceph_snapdir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
extern int ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry, int err);
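
The last super.h hunk swaps the old two-argument ceph_make_fpos() (fragment in the high 32 bits, offset in the low 32) for a three-argument version that also records whether the listing is in name-hash order, matching the per-hash offset resets done in ceph_readdir_prepopulate(). The sketch below reuses the old 32/32 packing for illustration and fakes the hash; the new helper's exact bit layout and the real ceph_str_hash() are not shown in this diff:

#include <stdint.h>
#include <stdio.h>

static uint64_t make_fpos(uint32_t high, uint32_t off)
{
	return ((uint64_t)high << 32) | off;	/* old ceph_make_fpos() layout */
}

/* toy stand-in for ceph_str_hash(), not the real directory hash */
static uint32_t name_hash(const char *name)
{
	uint32_t h = 2166136261u;

	while (*name)
		h = (h ^ (uint8_t)*name++) * 16777619u;
	return h;
}

int main(void)
{
	/* the duplicate name just keeps two entries in one hash bucket */
	const char *names[] = { "a", "a", "b" };
	uint32_t last_hash = 0, fpos_offset = 2;
	size_t i;

	for (i = 0; i < 3; i++) {
		uint32_t hash = name_hash(names[i]);

		if (hash != last_hash)
			fpos_offset = 2;	/* new bucket: restart at 2 ("." and "..") */
		last_hash = hash;
		printf("%s -> %#llx\n", names[i],
		       (unsigned long long)make_fpos(hash, fpos_offset++));
	}
	return 0;
}
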
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..4870b29df224 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
char buf[128];
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
ret = -ERANGE;
}
}
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
return ret;
}
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
const char *pool_name;
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name)
ret = snprintf(val, size, "%s", pool_name);
else
ret = snprintf(val, size, "%lld", (unsigned long long)pool);
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
return ret;
}
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_pagelist *pagelist = NULL;
+ int op = CEPH_MDS_OP_SETXATTR;
int err;
if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (err)
goto out;
} else if (!value) {
- flags |= CEPH_XATTR_REMOVE;
+ if (flags & CEPH_XATTR_REPLACE)
+ op = CEPH_MDS_OP_RMXATTR;
+ else
+ flags |= CEPH_XATTR_REMOVE;
}
dout("setxattr value=%.*s\n", (int)size, value);
/* do request */
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
- USE_AUTH_MDS);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
}
- req->r_args.setxattr.flags = cpu_to_le32(flags);
req->r_path2 = kstrdup(name, GFP_NOFS);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
goto out;
}
- req->r_pagelist = pagelist;
- pagelist = NULL;
+ if (op == CEPH_MDS_OP_SETXATTR) {
+ req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_pagelist = pagelist;
+ pagelist = NULL;
+ }
req->r_inode = inode;
ihold(inode);
@@ -1051,12 +1056,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
}
static int ceph_set_xattr_handler(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
- return __ceph_setxattr(d_inode(dentry), name, value, size, flags);
+ return __ceph_setxattr(inode, name, value, size, flags);
}
const struct xattr_handler ceph_other_xattr_handler = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 24b142569ca9..687471dc04a0 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -91,6 +91,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
break;
}
+ if (i < CHRDEV_MAJOR_DYN_END)
+ pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range",
+ name, i);
+
if (i == 0) {
ret = -EBUSY;
goto out;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 6908080e9b6d..b611fc2e8984 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -24,10 +24,13 @@
#include <linux/string.h>
#include <keys/user-type.h>
#include <linux/key-type.h>
+#include <linux/keyctl.h>
#include <linux/inet.h>
#include "cifsglob.h"
#include "cifs_spnego.h"
#include "cifs_debug.h"
+#include "cifsproto.h"
+static const struct cred *spnego_cred;
/* create a new cifs key */
static int
@@ -102,6 +105,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
size_t desc_len;
struct key *spnego_key;
const char *hostname = server->hostname;
+ const struct cred *saved_cred;
/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
host=hostname sec=mechanism uid=0xFF user=username */
@@ -163,7 +167,9 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
sprintf(dp, ";pid=0x%x", current->pid);
cifs_dbg(FYI, "key description = %s\n", description);
+ saved_cred = override_creds(spnego_cred);
spnego_key = request_key(&cifs_spnego_key_type, description, "");
+ revert_creds(saved_cred);
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
@@ -177,3 +183,64 @@ out:
kfree(description);
return spnego_key;
}
+
+int
+init_cifs_spnego(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ cifs_dbg(FYI, "Registering the %s key type\n",
+ cifs_spnego_key_type.name);
+
+ /*
+ * Create an override credential set with special thread keyring for
+ * spnego upcalls.
+ */
+
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = keyring_alloc(".cifs_spnego",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = register_key_type(&cifs_spnego_key_type);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /*
+ * instruct request_key() to use this special keyring as a cache for
+ * the results it looks up
+ */
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ spnego_cred = cred;
+
+ cifs_dbg(FYI, "cifs spnego keyring: %d\n", key_serial(keyring));
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+void
+exit_cifs_spnego(void)
+{
+ key_revoke(spnego_cred->thread_keyring);
+ unregister_key_type(&cifs_spnego_key_type);
+ put_cred(spnego_cred);
+ cifs_dbg(FYI, "Unregistered %s key type\n", cifs_spnego_key_type.name);
+}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 3f93125916bf..71e8a56e9479 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -360,7 +360,7 @@ init_cifs_idmap(void)
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 08fa36e5b2bc..5d8b7edf8a8f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -890,7 +890,6 @@ const struct inode_operations cifs_dir_inode_ops = {
.rmdir = cifs_rmdir,
.rename2 = cifs_rename2,
.permission = cifs_permission,
-/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
.symlink = cifs_symlink,
.mknod = cifs_mknod,
@@ -901,9 +900,8 @@ const struct inode_operations cifs_dir_inode_ops = {
};
const struct inode_operations cifs_file_inode_ops = {
-/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
- .getattr = cifs_getattr, /* do we need this anymore? */
+ .getattr = cifs_getattr,
.permission = cifs_permission,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -915,9 +913,6 @@ const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
.get_link = cifs_get_link,
.permission = cifs_permission,
- /* BB add the following two eventually */
- /* revalidate: cifs_revalidate,
- setattr: cifs_notify_change, *//* BB do we need notify change */
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
@@ -1303,7 +1298,7 @@ init_cifs(void)
goto out_destroy_mids;
#ifdef CONFIG_CIFS_UPCALL
- rc = register_key_type(&cifs_spnego_key_type);
+ rc = init_cifs_spnego();
if (rc)
goto out_destroy_request_bufs;
#endif /* CONFIG_CIFS_UPCALL */
@@ -1326,7 +1321,7 @@ out_init_cifs_idmap:
out_register_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
- unregister_key_type(&cifs_spnego_key_type);
+ exit_cifs_spnego();
out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0f9a6bc4ba43..1243bd326591 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -58,6 +58,8 @@ do { \
} while (0)
extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
+extern int init_cifs_spnego(void);
+extern void exit_cifs_spnego(void);
extern char *build_path_from_dentry(struct dentry *);
extern char *cifs_build_path_to_root(struct smb_vol *vol,
struct cifs_sb_info *cifs_sb,
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index c8b77aa24a1d..5e23f64c0804 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -39,8 +39,9 @@
enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
static int cifs_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
int rc = -EOPNOTSUPP;
unsigned int xid;
@@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
if (value &&
pTcon->ses->server->ops->set_acl)
rc = pTcon->ses->server->ops->set_acl(pacl,
- size, d_inode(dentry),
+ size, inode,
full_path, CIFS_ACL_DACL);
else
rc = -EOPNOTSUPP;
if (rc == 0) /* force revalidate of the inode */
- CIFS_I(d_inode(dentry))->time = 0;
+ CIFS_I(inode)->time = 0;
kfree(pacl);
}
#endif /* CONFIG_CIFS_ACL */
diff --git a/fs/compat.c b/fs/compat.c
index 8754e9aa14ad..be6e48b0a46c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
}
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user(offset, &dirent->d_off))
goto efault;
}
@@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name,
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user_unaligned(offset, &dirent->d_off))
goto efault;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 492c2db25dc9..281b768000e6 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -413,7 +413,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
if (!mm->core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
@@ -792,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
return 0;
file->f_pos = pos;
cprm->written += n;
+ cprm->pos += n;
nr -= n;
}
return 1;
@@ -806,6 +809,7 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
if (dump_interrupted() ||
file->f_op->llseek(file, nr, SEEK_CUR) < 0)
return 0;
+ cprm->pos += nr;
return 1;
} else {
while (nr > PAGE_SIZE) {
@@ -820,7 +824,7 @@ EXPORT_SYMBOL(dump_skip);
int dump_align(struct coredump_params *cprm, int align)
{
- unsigned mod = cprm->file->f_pos & (align - 1);
+ unsigned mod = cprm->pos & (align - 1);
if (align & (align - 1))
return 0;
return mod ? dump_skip(cprm, align - mod) : 1;
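
dump_emit() and dump_skip() above now maintain cprm->pos themselves, so dump_align() can compute padding from that software position instead of file->f_pos and keep working when the core dump goes to a pipe. A tiny sketch of the same arithmetic; align_gap() is an illustrative stand-in:

#include <stdio.h>

/* Padding needed to bring pos up to the next multiple of align
 * (align must be a power of two, as dump_align() requires). */
static long align_gap(long long pos, int align)
{
	unsigned mod = (unsigned)(pos & (align - 1));

	if (align & (align - 1))	/* reject non-power-of-two alignments */
		return -1;
	return mod ? align - mod : 0;
}

int main(void)
{
	printf("%ld %ld %ld\n", align_gap(0, 8), align_gap(5, 8), align_gap(16, 8));
	/* prints: 0 3 0 */
	return 0;
}
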
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 06f5aa478bf2..1ac263eddc4e 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -78,6 +78,67 @@ out:
return res;
}
+static int validate_user_key(struct fscrypt_info *crypt_info,
+ struct fscrypt_context *ctx, u8 *raw_key,
+ u8 *prefix, int prefix_size)
+{
+ u8 *full_key_descriptor;
+ struct key *keyring_key;
+ struct fscrypt_key *master_key;
+ const struct user_key_payload *ukp;
+ int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
+ int res;
+
+ full_key_descriptor = kmalloc(full_key_len, GFP_NOFS);
+ if (!full_key_descriptor)
+ return -ENOMEM;
+
+ memcpy(full_key_descriptor, prefix, prefix_size);
+ sprintf(full_key_descriptor + prefix_size,
+ "%*phN", FS_KEY_DESCRIPTOR_SIZE,
+ ctx->master_key_descriptor);
+ full_key_descriptor[full_key_len - 1] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ kfree(full_key_descriptor);
+ if (IS_ERR(keyring_key))
+ return PTR_ERR(keyring_key);
+
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "%s: key type must be logon\n", __func__);
+ res = -ENOKEY;
+ goto out;
+ }
+ down_read(&keyring_key->sem);
+ ukp = user_key_payload(keyring_key);
+ if (ukp->datalen != sizeof(struct fscrypt_key)) {
+ res = -EINVAL;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ master_key = (struct fscrypt_key *)ukp->data;
+ BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+ if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "%s: key size incorrect: %d\n",
+ __func__, master_key->size);
+ res = -ENOKEY;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
+ up_read(&keyring_key->sem);
+ if (res)
+ goto out;
+
+ crypt_info->ci_keyring_key = keyring_key;
+ return 0;
+out:
+ key_put(keyring_key);
+ return res;
+}
+
static void put_crypt_info(struct fscrypt_info *ci)
{
if (!ci)
@@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
int get_crypt_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
- u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
- (FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
- struct key *keyring_key = NULL;
- struct fscrypt_key *master_key;
struct fscrypt_context ctx;
- const struct user_key_payload *ukp;
struct crypto_skcipher *ctfm;
const char *cipher_str;
u8 raw_key[FS_MAX_KEY_SIZE];
@@ -167,48 +223,24 @@ retry:
memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
goto got_key;
}
- memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
- FS_KEY_DESC_PREFIX_SIZE);
- sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
- "%*phN", FS_KEY_DESCRIPTOR_SIZE,
- ctx.master_key_descriptor);
- full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
- (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
- keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
- if (IS_ERR(keyring_key)) {
- res = PTR_ERR(keyring_key);
- keyring_key = NULL;
- goto out;
- }
- crypt_info->ci_keyring_key = keyring_key;
- if (keyring_key->type != &key_type_logon) {
- printk_once(KERN_WARNING
- "%s: key type must be logon\n", __func__);
- res = -ENOKEY;
- goto out;
- }
- down_read(&keyring_key->sem);
- ukp = user_key_payload(keyring_key);
- if (ukp->datalen != sizeof(struct fscrypt_key)) {
- res = -EINVAL;
- up_read(&keyring_key->sem);
- goto out;
- }
- master_key = (struct fscrypt_key *)ukp->data;
- BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
- if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
- printk_once(KERN_WARNING
- "%s: key size incorrect: %d\n",
- __func__, master_key->size);
- res = -ENOKEY;
- up_read(&keyring_key->sem);
+ res = validate_user_key(crypt_info, &ctx, raw_key,
+ FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE);
+ if (res && inode->i_sb->s_cop->key_prefix) {
+ u8 *prefix = NULL;
+ int prefix_size, res2;
+
+ prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix);
+ res2 = validate_user_key(crypt_info, &ctx, raw_key,
+ prefix, prefix_size);
+ if (res2) {
+ if (res2 == -ENOKEY)
+ res = -ENOKEY;
+ goto out;
+ }
+ } else if (res) {
goto out;
}
- res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
- up_read(&keyring_key->sem);
- if (res)
- goto out;
got_key:
ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
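
validate_user_key() above factors out the request_key() lookup so it can be retried with a filesystem-specific prefix: the key description is just the prefix followed by the on-disk master key descriptor rendered as hex. A userspace sketch of that formatting; the 8-byte descriptor size and the "fscrypt:" prefix are assumptions here, and the kernel does the hex step with its %*phN printk extension instead of a loop:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DESCRIPTOR_SIZE 8	/* assumed FS_KEY_DESCRIPTOR_SIZE */

static char *build_key_description(const char *prefix,
				   const uint8_t desc[DESCRIPTOR_SIZE])
{
	size_t prefix_len = strlen(prefix);
	char *s = malloc(prefix_len + DESCRIPTOR_SIZE * 2 + 1);
	int i;

	if (!s)
		return NULL;
	memcpy(s, prefix, prefix_len);
	for (i = 0; i < DESCRIPTOR_SIZE; i++)
		sprintf(s + prefix_len + 2 * i, "%02x", desc[i]);
	return s;	/* e.g. "fscrypt:0123456789abcdef" */
}

int main(void)
{
	uint8_t desc[DESCRIPTOR_SIZE] = { 0x01, 0x23, 0x45, 0x67,
					  0x89, 0xab, 0xcd, 0xef };
	char *d = build_key_description("fscrypt:", desc);

	if (d)
		printf("%s\n", d);
	free(d);
	return 0;
}
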
diff --git a/fs/dax.c b/fs/dax.c
index 0dbe4e0f16fe..761495bf5eb9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,44 @@
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+/*
+ * We use lowest available bit in exceptional entry for locking, other two
+ * bits to determine entry type. In total 3 special bits.
+ */
+#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+ RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+ RADIX_TREE_EXCEPTIONAL_ENTRY))
+
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+static int __init init_dax_wait_table(void)
+{
+ int i;
+
+ for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+ init_waitqueue_head(wait_table + i);
+ return 0;
+}
+fs_initcall(init_dax_wait_table);
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+ pgoff_t index)
+{
+ unsigned long hash = hash_long((unsigned long)mapping ^ index,
+ DAX_WAIT_TABLE_BITS);
+ return wait_table + hash;
+}
+
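
The macros and the wait-table hash above pack everything about a mapped sector into a single radix-tree word: a few low bits mark the entry as exceptional, carry the PTE/PMD type and (as the later hunks add) a lock bit, and the sector number sits in the bits above them. A userspace model with simplified bit positions; the ENTRY_* constants and make_entry()/entry_sector() are illustrative, not the kernel's RADIX_DAX_* values:

#include <stdint.h>
#include <stdio.h>

#define ENTRY_EXCEPTIONAL	(1ul << 0)
#define ENTRY_LOCK		(1ul << 1)
#define ENTRY_PTE		(1ul << 2)
#define ENTRY_PMD		(1ul << 3)
#define ENTRY_SHIFT		4

static unsigned long make_entry(uint64_t sector, int pmd)
{
	return ((unsigned long)sector << ENTRY_SHIFT) |
	       (pmd ? ENTRY_PMD : ENTRY_PTE) | ENTRY_EXCEPTIONAL;
}

static uint64_t entry_sector(unsigned long entry)
{
	return entry >> ENTRY_SHIFT;	/* flag bits fall away */
}

int main(void)
{
	unsigned long e = make_entry(12345, 0);

	e |= ENTRY_LOCK;			/* lock_slot() analogue */
	printf("sector=%llu pmd=%d locked=%d\n",
	       (unsigned long long)entry_sector(e),
	       !!(e & ENTRY_PMD), !!(e & ENTRY_LOCK));
	e &= ~ENTRY_LOCK;			/* unlock_slot() analogue */
	return 0;
}
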
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
struct request_queue *q = bdev->bd_queue;
@@ -78,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
return page;
}
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
- struct blk_dax_ctl dax = {
- .sector = _sector,
- .size = _size,
- };
-
- might_sleep();
- do {
- long count, sz;
-
- count = dax_map_atomic(bdev, &dax);
- if (count < 0)
- return count;
- sz = min_t(long, count, SZ_128K);
- clear_pmem(dax.addr, sz);
- dax.size -= sz;
- dax.sector += sz / 512;
- dax_unmap_atomic(bdev, &dax);
- cond_resched();
- } while (dax.size);
-
- wmb_pmem();
- return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
- loff_t pos, loff_t end)
-{
- loff_t final = end - pos + first; /* The final byte of the buffer */
-
- if (first > 0)
- clear_pmem(addr, first);
- if (final < size)
- clear_pmem(addr + final, size - final);
-}
-
static bool buffer_written(struct buffer_head *bh)
{
return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -160,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
struct blk_dax_ctl dax = {
.addr = (void __pmem *) ERR_PTR(-EIO),
};
+ unsigned blkbits = inode->i_blkbits;
+ sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+ >> blkbits;
if (rw == READ)
end = min(end, i_size_read(inode));
@@ -167,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
while (pos < end) {
size_t len;
if (pos == max) {
- unsigned blkbits = inode->i_blkbits;
long page = pos >> PAGE_SHIFT;
sector_t block = page << (PAGE_SHIFT - blkbits);
unsigned first = pos - (block << blkbits);
@@ -183,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
bdev = bh->b_bdev;
+ /*
+ * We allow uninitialized buffers for writes
+ * beyond EOF as those cannot race with faults
+ */
+ WARN_ON_ONCE(
+ (buffer_new(bh) && block < file_blks) ||
+ (rw == WRITE && buffer_unwritten(bh)));
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -202,11 +205,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
rc = map_len;
break;
}
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(dax.addr, map_len, first,
- pos, end);
- need_wmb = true;
- }
dax.addr += first;
size = map_len - first;
}
@@ -267,15 +265,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
memset(&bh, 0, sizeof(bh));
bh.b_bdev = inode->i_sb->s_bdev;
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
- struct address_space *mapping = inode->i_mapping;
+ if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_lock(inode);
- retval = filemap_write_and_wait_range(mapping, pos, end - 1);
- if (retval) {
- inode_unlock(inode);
- goto out;
- }
- }
/* Protects against truncate */
if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -296,12 +287,268 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
- out:
return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
/*
+ * DAX radix tree locking
+ */
+struct exceptional_entry_key {
+ struct address_space *mapping;
+ unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+ wait_queue_t wait;
+ struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+ int sync, void *keyp)
+{
+ struct exceptional_entry_key *key = keyp;
+ struct wait_exceptional_entry_queue *ewait =
+ container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+ if (key->mapping != ewait->key.mapping ||
+ key->index != ewait->key.index)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Check whether the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+ entry |= RADIX_DAX_ENTRY_LOCK;
+ radix_tree_replace_slot(slot, (void *)entry);
+ return (void *)entry;
+}
+
+/*
+ * Mark the given slot is unlocked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+ entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+ radix_tree_replace_slot(slot, (void *)entry);
+ return (void *)entry;
+}
+
+/*
+ * Lookup entry in radix tree, wait for it to become unlocked if it is
+ * exceptional entry and return it. The caller must call
+ * put_unlocked_mapping_entry() when it decides not to lock the entry or
+ * put_locked_mapping_entry() when it has locked the entry and now wants to
+ * unlock it.
+ *
+ * The function must be called with mapping->tree_lock held.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void ***slotp)
+{
+ void *ret, **slot;
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+ ewait.key.mapping = mapping;
+ ewait.key.index = index;
+
+ for (;;) {
+ ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ &slot);
+ if (!ret || !radix_tree_exceptional_entry(ret) ||
+ !slot_locked(mapping, slot)) {
+ if (slotp)
+ *slotp = slot;
+ return ret;
+ }
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&mapping->tree_lock);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+}
+
+/*
+ * Find radix tree entry at given index. If it points to a page, return with
+ * the page locked. If it points to the exceptional entry, return with the
+ * radix tree entry locked. If the radix tree doesn't contain the given index,
+ * create an empty exceptional entry for the index and return with it locked.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *ret, **slot;
+
+restart:
+ spin_lock_irq(&mapping->tree_lock);
+ ret = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* No entry for given index? Make sure radix tree is big enough. */
+ if (!ret) {
+ int err;
+
+ spin_unlock_irq(&mapping->tree_lock);
+ err = radix_tree_preload(
+ mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+ if (err)
+ return ERR_PTR(err);
+ ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+ RADIX_DAX_ENTRY_LOCK);
+ spin_lock_irq(&mapping->tree_lock);
+ err = radix_tree_insert(&mapping->page_tree, index, ret);
+ radix_tree_preload_end();
+ if (err) {
+ spin_unlock_irq(&mapping->tree_lock);
+ /* Someone already created the entry? */
+ if (err == -EEXIST)
+ goto restart;
+ return ERR_PTR(err);
+ }
+ /* Good, we have inserted empty locked entry into the tree. */
+ mapping->nrexceptional++;
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+ }
+ /* Normal page in radix tree? */
+ if (!radix_tree_exceptional_entry(ret)) {
+ struct page *page = ret;
+
+ get_page(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ lock_page(page);
+ /* Page got truncated? Retry... */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto restart;
+ }
+ return page;
+ }
+ ret = lock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ pgoff_t index, bool wake_all)
+{
+ wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+ /*
+ * Checking for locked entry and prepare_to_wait_exclusive() happens
+ * under mapping->tree_lock, ditto for entry handling in our callers.
+ * So at this point all tasks that could have seen our entry locked
+ * must be in the waitqueue and the following check will see them.
+ */
+ if (waitqueue_active(wq)) {
+ struct exceptional_entry_key key;
+
+ key.mapping = mapping;
+ key.index = index;
+ __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+ }
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *ret, **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+ !slot_locked(mapping, slot))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return;
+ }
+ unlock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry)) {
+ unlock_page(entry);
+ put_page(entry);
+ } else {
+ dax_unlock_mapping_entry(mapping, index);
+ }
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry))
+ return;
+
+ /* We have to wake up next waiter for the radix tree entry lock */
+ dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
+ * entry to get unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *entry;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ /*
+ * This gets called from truncate / punch_hole path. As such, the caller
+ * must hold locks protecting against concurrent modifications of the
+ * radix tree (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen exceptional entry for this index, we better find it
+ * at that index as well...
+ */
+ if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+ }
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, true);
+
+ return 1;
+}
+
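
Everything from get_unlocked_mapping_entry() down to dax_delete_mapping_entry() above implements a small lock protocol: the lock is a bit in the entry itself, flipped under tree_lock, and contenders sleep on a waitqueue chosen by hashing (mapping, index). A userspace analogue using pthreads, with a plain array standing in for the radix tree and condition variables for the hashed wait table; all names here are illustrative:

#include <pthread.h>

#define WAIT_TABLE_SIZE	64
#define NENTRIES	1024		/* toy tree: index must be < NENTRIES */
#define ENTRY_LOCK	1ul

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;	/* mapping->tree_lock stand-in */
static pthread_cond_t wait_table[WAIT_TABLE_SIZE];
static unsigned long entries[NENTRIES];

static void wait_table_init(void)
{
	int i;

	for (i = 0; i < WAIT_TABLE_SIZE; i++)
		pthread_cond_init(&wait_table[i], NULL);
}

static pthread_cond_t *entry_waitqueue(unsigned long index)
{
	/* crude multiplicative hash standing in for hash_long() */
	return &wait_table[(index * 2654435761ul) % WAIT_TABLE_SIZE];
}

static void lock_entry(unsigned long index)
{
	pthread_cond_t *wq = entry_waitqueue(index);

	pthread_mutex_lock(&tree_lock);
	while (entries[index] & ENTRY_LOCK)	/* get_unlocked_mapping_entry() analogue */
		pthread_cond_wait(wq, &tree_lock);
	entries[index] |= ENTRY_LOCK;		/* lock_slot() analogue */
	pthread_mutex_unlock(&tree_lock);
}

static void unlock_entry(unsigned long index)
{
	pthread_mutex_lock(&tree_lock);
	entries[index] &= ~ENTRY_LOCK;		/* unlock_slot() analogue */
	pthread_mutex_unlock(&tree_lock);
	/* wake waiters, like dax_wake_mapping_entry_waiter() */
	pthread_cond_broadcast(entry_waitqueue(index));
}

int main(void)
{
	wait_table_init();
	lock_entry(7);
	unlock_entry(7);
	return 0;
}

The kernel version wakes a single exclusive waiter in the common case and everyone only when an entry is deleted; the broadcast above is the simplest approximation that stays correct.
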
+/*
* The user has performed a load from a hole in the file. Allocating
* a new page in the file would cause excessive storage usage for
* workloads with sparse files. We allocate a page cache page instead.
@@ -309,24 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
* otherwise it will simply fall out of the page cache under memory
* pressure without ever having been dirtied.
*/
-static int dax_load_hole(struct address_space *mapping, struct page *page,
- struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+ struct vm_fault *vmf)
{
- unsigned long size;
- struct inode *inode = mapping->host;
- if (!page)
- page = find_or_create_page(mapping, vmf->pgoff,
- GFP_KERNEL | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- /* Recheck i_size under page lock to avoid truncate race */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- unlock_page(page);
- put_page(page);
- return VM_FAULT_SIGBUS;
+ struct page *page;
+
+ /* Hole page already exists? Return it... */
+ if (!radix_tree_exceptional_entry(entry)) {
+ vmf->page = entry;
+ return VM_FAULT_LOCKED;
}
+ /* This will replace the locked radix tree entry with a hole page */
+ page = find_or_create_page(mapping, vmf->pgoff,
+ vmf->gfp_mask | __GFP_ZERO);
+ if (!page) {
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ return VM_FAULT_OOM;
+ }
vmf->page = page;
return VM_FAULT_LOCKED;
}
@@ -350,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
return 0;
}
-#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
- sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+ struct vm_fault *vmf,
+ void *entry, sector_t sector)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- pgoff_t pmd_index = DAX_PMD_INDEX(index);
- int type, error = 0;
- void *entry;
+ int error = 0;
+ bool hole_fill = false;
+ void *new_entry;
+ pgoff_t index = vmf->pgoff;
- WARN_ON_ONCE(pmd_entry && !dirty);
- if (dirty)
+ if (vmf->flags & FAULT_FLAG_WRITE)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- spin_lock_irq(&mapping->tree_lock);
-
- entry = radix_tree_lookup(page_tree, pmd_index);
- if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
- index = pmd_index;
- goto dirty;
+ /* Replacing hole page with block mapping? */
+ if (!radix_tree_exceptional_entry(entry)) {
+ hole_fill = true;
+ /*
+ * Unmap the page now before we remove it from the page cache below.
+ * The page is locked so it cannot be faulted in again.
+ */
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_SIZE, 0);
+ error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+ if (error)
+ return ERR_PTR(error);
}
- entry = radix_tree_lookup(page_tree, index);
- if (entry) {
- type = RADIX_DAX_TYPE(entry);
- if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
- type != RADIX_DAX_PMD)) {
- error = -EIO;
+ spin_lock_irq(&mapping->tree_lock);
+ new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+ RADIX_DAX_ENTRY_LOCK);
+ if (hole_fill) {
+ __delete_from_page_cache(entry, NULL);
+ /* Drop pagecache reference */
+ put_page(entry);
+ error = radix_tree_insert(page_tree, index, new_entry);
+ if (error) {
+ new_entry = ERR_PTR(error);
goto unlock;
}
+ mapping->nrexceptional++;
+ } else {
+ void **slot;
+ void *ret;
- if (!pmd_entry || type == RADIX_DAX_PMD)
- goto dirty;
-
- /*
- * We only insert dirty PMD entries into the radix tree. This
- * means we don't need to worry about removing a dirty PTE
- * entry and inserting a clean PMD entry, thus reducing the
- * range we would flush with a follow-up fsync/msync call.
- */
- radix_tree_delete(&mapping->page_tree, index);
- mapping->nrexceptional--;
- }
-
- if (sector == NO_SECTOR) {
- /*
- * This can happen during correct operation if our pfn_mkwrite
- * fault raced against a hole punch operation. If this
- * happens the pte that was hole punched will have been
- * unmapped and the radix tree entry will have been removed by
- * the time we are called, but the call will still happen. We
- * will return all the way up to wp_pfn_shared(), where the
- * pte_same() check will fail, eventually causing page fault
- * to be retried by the CPU.
- */
- goto unlock;
+ ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+ WARN_ON_ONCE(ret != entry);
+ radix_tree_replace_slot(slot, new_entry);
}
-
- error = radix_tree_insert(page_tree, index,
- RADIX_DAX_ENTRY(sector, pmd_entry));
- if (error)
- goto unlock;
-
- mapping->nrexceptional++;
- dirty:
- if (dirty)
+ if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
spin_unlock_irq(&mapping->tree_lock);
- return error;
+ if (hole_fill) {
+ radix_tree_preload_end();
+ /*
+ * We don't need the hole page any more; it has been replaced with
+ * a locked radix tree entry now.
+ */
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(entry);
+ unlock_page(entry);
+ put_page(entry);
+ }
+ return new_entry;
}
static int dax_writeback_one(struct block_device *bdev,
@@ -546,56 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+ struct buffer_head *bh, void **entryp,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- struct address_space *mapping = inode->i_mapping;
struct block_device *bdev = bh->b_bdev;
struct blk_dax_ctl dax = {
- .sector = to_sector(bh, inode),
+ .sector = to_sector(bh, mapping->host),
.size = bh->b_size,
};
- pgoff_t size;
- int error;
-
- i_mmap_lock_read(mapping);
-
- /*
- * Check truncate didn't happen while we were allocating a block.
- * If it did, this block may or may not be still allocated to the
- * file. We can't tell the filesystem to free it because we can't
- * take i_mutex here. In the worst case, the file still has blocks
- * allocated past the end of the file.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- error = -EIO;
- goto out;
- }
-
- if (dax_map_atomic(bdev, &dax) < 0) {
- error = PTR_ERR(dax.addr);
- goto out;
- }
+ void *ret;
+ void *entry = *entryp;
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(dax.addr, PAGE_SIZE);
- wmb_pmem();
- }
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
dax_unmap_atomic(bdev, &dax);
- error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
- vmf->flags & FAULT_FLAG_WRITE);
- if (error)
- goto out;
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+ *entryp = ret;
- error = vm_insert_mixed(vma, vaddr, dax.pfn);
-
- out:
- i_mmap_unlock_read(mapping);
-
- return error;
+ return vm_insert_mixed(vma, vaddr, dax.pfn);
}
/**
@@ -603,24 +818,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- * to written so the data written to them is exposed. This is required for
- * required by write faults for filesystems that will return unwritten
- * extent mappings from @get_block, but it is optional for reads as
- * dax_insert_mapping() will always zero unwritten blocks. If the fs does
- * not support unwritten extents, the it should pass NULL.
*
* When a page fault occurs, filesystems may call this helper in their
* fault handler for DAX files. __dax_fault() assumes the caller has done all
* the necessary locking for the page fault to proceed successfully.
*/
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+ get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- struct page *page;
+ void *entry;
struct buffer_head bh;
unsigned long vaddr = (unsigned long)vmf->virtual_address;
unsigned blkbits = inode->i_blkbits;
@@ -629,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
int error;
int major = 0;
+ /*
+ * Check that the offset isn't beyond the end of the file now. The
+ * caller is supposed to hold locks serializing us with truncate /
+ * punch hole, so this is a reliable test.
+ */
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;
@@ -638,49 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_SIZE;
- repeat:
- page = find_get_page(mapping, vmf->pgoff);
- if (page) {
- if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
- put_page(page);
- return VM_FAULT_RETRY;
- }
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto repeat;
- }
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- /*
- * We have a struct page covering a hole in the file
- * from a read fault and we've raced with a truncate
- */
- error = -EIO;
- goto unlock_page;
- }
+ entry = grab_mapping_entry(mapping, vmf->pgoff);
+ if (IS_ERR(entry)) {
+ error = PTR_ERR(entry);
+ goto out;
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
-
- if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
- if (vmf->flags & FAULT_FLAG_WRITE) {
- error = get_block(inode, block, &bh, 1);
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
- if (!error && (bh.b_size < PAGE_SIZE))
- error = -EIO;
- if (error)
- goto unlock_page;
- } else {
- return dax_load_hole(mapping, page, vmf);
- }
- }
+ goto unlock_entry;
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
@@ -689,53 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
- vmf->page = page;
- if (!page) {
- i_mmap_lock_read(mapping);
- /* Check we didn't race with truncate */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
- error = -EIO;
- goto out;
- }
+ goto unlock_entry;
+ if (!radix_tree_exceptional_entry(entry)) {
+ vmf->page = entry;
+ return VM_FAULT_LOCKED;
}
- return VM_FAULT_LOCKED;
- }
-
- /* Check we didn't race with a read fault installing a new page */
- if (!page && major)
- page = find_lock_page(mapping, vmf->pgoff);
-
- if (page) {
- unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_SIZE, 0);
- delete_from_page_cache(page);
- unlock_page(page);
- put_page(page);
- page = NULL;
+ vmf->entry = entry;
+ return VM_FAULT_DAX_LOCKED;
}
- /*
- * If we successfully insert the new mapping over an unwritten extent,
- * we need to ensure we convert the unwritten extent. If there is an
- * error inserting the mapping, the filesystem needs to leave it as
- * unwritten to prevent exposure of the stale underlying data to
- * userspace, but we still need to call the completion function so
- * the private resources on the mapping buffer can be released. We
- * indicate what the callback should do via the uptodate variable, same
- * as for normal BH based IO completions.
- */
- error = dax_insert_mapping(inode, &bh, vma, vmf);
- if (buffer_unwritten(&bh)) {
- if (complete_unwritten)
- complete_unwritten(&bh, !error);
- else
- WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+ if (!buffer_mapped(&bh)) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ error = get_block(inode, block, &bh, 1);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
+ if (!error && (bh.b_size < PAGE_SIZE))
+ error = -EIO;
+ if (error)
+ goto unlock_entry;
+ } else {
+ return dax_load_hole(mapping, entry, vmf);
+ }
}
+ /* The filesystem should not return unwritten buffers to us! */
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
+ error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -743,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if ((error < 0) && (error != -EBUSY))
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
-
- unlock_page:
- if (page) {
- unlock_page(page);
- put_page(page);
- }
- goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -763,7 +920,7 @@ EXPORT_SYMBOL(__dax_fault);
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+ get_block_t get_block)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -772,7 +929,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+ result = __dax_fault(vma, vmf, get_block);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
@@ -780,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
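With the completion callback dropped from the signature, a DAX-capable filesystem now only wires its get_block routine into the fault path. A sketch of such a handler; the wrapper name is illustrative and ext2_get_block() merely stands in for any get_block_t:

/* Sketch: a filesystem's ->fault method after this interface change. */
static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, ext2_get_block);
}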
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
* more often than one might expect in the below function.
@@ -806,8 +963,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -819,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
struct block_device *bdev;
pgoff_t size, pgoff;
sector_t block;
- int error, result = 0;
+ int result = 0;
bool alloc = false;
/* dax pmd mappings require pfn_t_devmap() */
@@ -866,6 +1022,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (get_block(inode, block, &bh, 1) != 0)
return VM_FAULT_SIGBUS;
alloc = true;
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
}
bdev = bh.b_bdev;
@@ -891,26 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
truncate_pagecache_range(inode, lstart, lend);
}
- i_mmap_lock_read(mapping);
-
- /*
- * If a truncate happened while we were allocating blocks, we may
- * leave blocks allocated to the file that are beyond EOF. We can't
- * take i_mutex here, so just leave them hanging; they'll be freed
- * when the file is deleted.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size) {
- result = VM_FAULT_SIGBUS;
- goto out;
- }
- if ((pgoff | PG_PMD_COLOUR) >= size) {
- dax_pmd_dbg(&bh, address,
- "offset + huge page size > file size");
- goto fallback;
- }
-
- if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ if (!write && !buffer_mapped(&bh)) {
spinlock_t *ptl;
pmd_t entry;
struct page *zero_page = get_huge_zero_page();
@@ -945,8 +1083,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
long length = dax_map_atomic(bdev, &dax);
if (length < 0) {
- result = VM_FAULT_SIGBUS;
- goto out;
+ dax_pmd_dbg(&bh, address, "dax-error fallback");
+ goto fallback;
}
if (length < PMD_SIZE) {
dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -964,14 +1102,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
dax_pmd_dbg(&bh, address, "pfn not in memmap");
goto fallback;
}
-
- if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- clear_pmem(dax.addr, PMD_SIZE);
- wmb_pmem();
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- result |= VM_FAULT_MAJOR;
- }
dax_unmap_atomic(bdev, &dax);
/*
@@ -990,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
* the write to insert a dirty entry.
*/
if (write) {
- error = dax_radix_entry(mapping, pgoff, dax.sector,
- true, true);
- if (error) {
- dax_pmd_dbg(&bh, address,
- "PMD radix insertion failed");
- goto fallback;
- }
+ /*
+ * We should insert a radix tree entry and dirty it here.
+ * For now this is broken...
+ */
}
dev_dbg(part_to_dev(bdev->bd_part),
@@ -1009,11 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
}
out:
- i_mmap_unlock_read(mapping);
-
- if (buffer_unwritten(&bh))
- complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
return result;
fallback:
@@ -1033,8 +1155,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
* pmd_fault handler for DAX files.
*/
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1043,8 +1164,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
- complete_unwritten);
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
if (flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
@@ -1061,27 +1181,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
- int error;
-
- /*
- * We pass NO_SECTOR to dax_radix_entry() because we expect that a
- * RADIX_DAX_PTE entry already exists in the radix tree from a
- * previous call to __dax_fault(). We just want to look up that PTE
- * entry using vmf->pgoff and make sure the dirty tag is set. This
- * saves us from having to make a call to get_block() here to look
- * up the sector.
- */
- error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
- true);
+ struct address_space *mapping = file->f_mapping;
+ void *entry;
+ pgoff_t index = vmf->pgoff;
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- if (error)
- return VM_FAULT_SIGBUS;
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ if (!entry || !radix_tree_exceptional_entry(entry))
+ goto out;
+ radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+ put_unlocked_mapping_entry(mapping, index, entry);
+out:
+ spin_unlock_irq(&mapping->tree_lock);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+static bool dax_range_is_aligned(struct block_device *bdev,
+ unsigned int offset, unsigned int length)
+{
+ unsigned short sector_size = bdev_logical_block_size(bdev);
+
+ if (!IS_ALIGNED(offset, sector_size))
+ return false;
+ if (!IS_ALIGNED(length, sector_size))
+ return false;
+
+ return true;
+}
+
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ unsigned int offset, unsigned int length)
+{
+ struct blk_dax_ctl dax = {
+ .sector = sector,
+ .size = PAGE_SIZE,
+ };
+
+ if (dax_range_is_aligned(bdev, offset, length)) {
+ sector_t start_sector = dax.sector + (offset >> 9);
+
+ return blkdev_issue_zeroout(bdev, start_sector,
+ length >> 9, GFP_NOFS, true);
+ } else {
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
+ wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
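A worked illustration of the split in __dax_zero_page_range(), assuming a 512-byte logical block size; the byte values below are made up:

/*
 * Sketch: bdev/sector stand for a real device and the starting sector
 * of the block being partially zeroed.
 */
static int example_zero_partial_block(struct block_device *bdev,
				      sector_t sector)
{
	int err;

	/* 512 and 3584 are both sector-aligned, so this goes through
	 * blkdev_issue_zeroout(): 7 sectors starting at sector + 1. */
	err = __dax_zero_page_range(bdev, sector, 512, 3584);
	if (err)
		return err;

	/* 100 and 40 are not sector-aligned, so this takes the pmem
	 * path: dax_map_atomic() + clear_pmem() + wmb_pmem(). */
	return __dax_zero_page_range(bdev, sector, 100, 40);
}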
/**
* dax_zero_page_range - zero a range within a page of a DAX file
* @inode: The file being truncated
@@ -1093,12 +1245,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
* page in a DAX file. This is intended for hole-punch operations. If
* you are truncating a file, the helper function dax_truncate_page() may be
* more convenient.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
*/
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
get_block_t get_block)
@@ -1117,23 +1263,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_SIZE;
err = get_block(inode, index, &bh, 0);
- if (err < 0)
+ if (err < 0 || !buffer_written(&bh))
return err;
- if (buffer_written(&bh)) {
- struct block_device *bdev = bh.b_bdev;
- struct blk_dax_ctl dax = {
- .sector = to_sector(&bh, inode),
- .size = PAGE_SIZE,
- };
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- wmb_pmem();
- dax_unmap_atomic(bdev, &dax);
- }
-
- return 0;
+ return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+ offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
@@ -1145,12 +1279,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
*
* Similar to block_truncate_page(), this function can be called by a
* filesystem when it is truncating a DAX file to handle the partial page.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
*/
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
diff --git a/fs/dcache.c b/fs/dcache.c
index c622872c12c5..817c243c1ff1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1636,7 +1636,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
struct dentry *dentry = __d_alloc(parent->d_sb, name);
if (!dentry)
return NULL;
-
+ dentry->d_flags |= DCACHE_RCUACCESS;
spin_lock(&parent->d_lock);
/*
* don't need child lock because it is not subject
@@ -1670,8 +1670,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
struct qstr q;
q.name = name;
- q.len = strlen(name);
- q.hash = full_name_hash(q.name, q.len);
+ q.hash_len = hashlen_string(name);
return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);
@@ -2359,7 +2358,6 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
{
BUG_ON(!d_unhashed(entry));
hlist_bl_lock(b);
- entry->d_flags |= DCACHE_RCUACCESS;
hlist_bl_add_head_rcu(&entry->d_hash, b);
hlist_bl_unlock(b);
}
@@ -2844,6 +2842,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
/* ... and switch them in the tree */
if (IS_ROOT(dentry)) {
/* splicing a tree */
+ dentry->d_flags |= DCACHE_RCUACCESS;
dentry->d_parent = target->d_parent;
target->d_parent = target;
list_del_init(&target->d_child);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index d2ba12e23ed9..592059f88e04 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -22,6 +22,12 @@
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/device.h>
+#include <linux/srcu.h>
+#include <asm/poll.h>
+
+#include "internal.h"
+
+struct poll_table_struct;
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -35,27 +41,294 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
return count;
}
-const struct file_operations debugfs_file_operations = {
+const struct file_operations debugfs_noop_file_operations = {
.read = default_read_file,
.write = default_write_file,
.open = simple_open,
.llseek = noop_llseek,
};
-static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
- struct dentry *parent, void *value,
- const struct file_operations *fops,
- const struct file_operations *fops_ro,
- const struct file_operations *fops_wo)
+/**
+ * debugfs_use_file_start - mark the beginning of file data access
+ * @dentry: the dentry object whose data is being accessed.
+ * @srcu_idx: a pointer to some memory to store a SRCU index in.
+ *
+ * Up to a matching call to debugfs_use_file_finish(), any
+ * subsequent call into the file-removing functions debugfs_remove()
+ * and debugfs_remove_recursive() will block. Since associated private
+ * file data may only get freed after a successful return of any of
+ * the removal functions, you may safely access it after a successful
+ * call to debugfs_use_file_start() without worrying about
+ * lifetime issues.
+ *
+ * If -%EIO is returned, the file has already been removed and thus,
+ * it is not safe to access any of its data. If, on the other hand,
+ * it is allowed to access the file data, zero is returned.
+ *
+ * Regardless of the return code, any call to
+ * debugfs_use_file_start() must be followed by a matching call
+ * to debugfs_use_file_finish().
+ */
+int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
+ __acquires(&debugfs_srcu)
+{
+ *srcu_idx = srcu_read_lock(&debugfs_srcu);
+ barrier();
+ if (d_unlinked(dentry))
+ return -EIO;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(debugfs_use_file_start);
+
+/**
+ * debugfs_use_file_finish - mark the end of file data access
+ * @srcu_idx: the SRCU index "created" by a former call to
+ * debugfs_use_file_start().
+ *
+ * Allow any ongoing concurrent call into debugfs_remove() or
+ * debugfs_remove_recursive() blocked by a former call to
+ * debugfs_use_file_start() to proceed and return to its caller.
+ */
+void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
+{
+ srcu_read_unlock(&debugfs_srcu, srcu_idx);
+}
+EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
+
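The intended usage of these two helpers in an "unsafe" fops method looks roughly as follows; the handler and its private state are made up for the sketch:

/* Hypothetical per-file state; real users keep whatever they need. */
struct example_state {
	void *buf;
	size_t len;
};

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct example_state *st = file->private_data;
	ssize_t ret;
	int srcu_idx;

	ret = debugfs_use_file_start(file->f_path.dentry, &srcu_idx);
	if (likely(!ret))
		ret = simple_read_from_buffer(buf, count, ppos,
					      st->buf, st->len);
	debugfs_use_file_finish(srcu_idx);
	return ret;
}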
+#define F_DENTRY(filp) ((filp)->f_path.dentry)
+
+#define REAL_FOPS_DEREF(dentry) \
+ ((const struct file_operations *)(dentry)->d_fsdata)
+
+static int open_proxy_open(struct inode *inode, struct file *filp)
+{
+ const struct dentry *dentry = F_DENTRY(filp);
+ const struct file_operations *real_fops = NULL;
+ int srcu_idx, r;
+
+ r = debugfs_use_file_start(dentry, &srcu_idx);
+ if (r) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ real_fops = REAL_FOPS_DEREF(dentry);
+ real_fops = fops_get(real_fops);
+ if (!real_fops) {
+ /* Huh? Module did not clean up after itself at exit? */
+ WARN(1, "debugfs file owner did not clean up at exit: %pd",
+ dentry);
+ r = -ENXIO;
+ goto out;
+ }
+ replace_fops(filp, real_fops);
+
+ if (real_fops->open)
+ r = real_fops->open(inode, filp);
+
+out:
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+const struct file_operations debugfs_open_proxy_file_operations = {
+ .open = open_proxy_open,
+};
+
+#define PROTO(args...) args
+#define ARGS(args...) args
+
+#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \
+static ret_type full_proxy_ ## name(proto) \
+{ \
+ const struct dentry *dentry = F_DENTRY(filp); \
+ const struct file_operations *real_fops = \
+ REAL_FOPS_DEREF(dentry); \
+ int srcu_idx; \
+ ret_type r; \
+ \
+ r = debugfs_use_file_start(dentry, &srcu_idx); \
+ if (likely(!r)) \
+ r = real_fops->name(args); \
+ debugfs_use_file_finish(srcu_idx); \
+ return r; \
+}
+
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence));
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
+
+FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
+ PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
+ ARGS(filp, cmd, arg));
+
+static unsigned int full_proxy_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ const struct dentry *dentry = F_DENTRY(filp);
+ const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+ int srcu_idx;
+ unsigned int r = 0;
+
+ if (debugfs_use_file_start(dentry, &srcu_idx)) {
+ debugfs_use_file_finish(srcu_idx);
+ return POLLHUP;
+ }
+
+ r = real_fops->poll(filp, wait);
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+static int full_proxy_release(struct inode *inode, struct file *filp)
+{
+ const struct dentry *dentry = F_DENTRY(filp);
+ const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+ const struct file_operations *proxy_fops = filp->f_op;
+ int r = 0;
+
+ /*
+ * We must not protect this against removal races here: the
+ * original releaser should be called unconditionally in order
+ * not to leak any resources. Releasers must not assume that
+ * ->i_private is still meaningful here.
+ */
+ if (real_fops->release)
+ r = real_fops->release(inode, filp);
+
+ replace_fops(filp, d_inode(dentry)->i_fop);
+ kfree((void *)proxy_fops);
+ fops_put(real_fops);
+ return r;
+}
+
+static void __full_proxy_fops_init(struct file_operations *proxy_fops,
+ const struct file_operations *real_fops)
+{
+ proxy_fops->release = full_proxy_release;
+ if (real_fops->llseek)
+ proxy_fops->llseek = full_proxy_llseek;
+ if (real_fops->read)
+ proxy_fops->read = full_proxy_read;
+ if (real_fops->write)
+ proxy_fops->write = full_proxy_write;
+ if (real_fops->poll)
+ proxy_fops->poll = full_proxy_poll;
+ if (real_fops->unlocked_ioctl)
+ proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
+}
+
+static int full_proxy_open(struct inode *inode, struct file *filp)
+{
+ const struct dentry *dentry = F_DENTRY(filp);
+ const struct file_operations *real_fops = NULL;
+ struct file_operations *proxy_fops = NULL;
+ int srcu_idx, r;
+
+ r = debugfs_use_file_start(dentry, &srcu_idx);
+ if (r) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ real_fops = REAL_FOPS_DEREF(dentry);
+ real_fops = fops_get(real_fops);
+ if (!real_fops) {
+ /* Huh? Module did not clean up after itself at exit? */
+ WARN(1, "debugfs file owner did not clean up at exit: %pd",
+ dentry);
+ r = -ENXIO;
+ goto out;
+ }
+
+ proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL);
+ if (!proxy_fops) {
+ r = -ENOMEM;
+ goto free_proxy;
+ }
+ __full_proxy_fops_init(proxy_fops, real_fops);
+ replace_fops(filp, proxy_fops);
+
+ if (real_fops->open) {
+ r = real_fops->open(inode, filp);
+ if (r) {
+ replace_fops(filp, d_inode(dentry)->i_fop);
+ goto free_proxy;
+ } else if (filp->f_op != proxy_fops) {
+ /* No protection against file removal anymore. */
+ WARN(1, "debugfs file owner replaced proxy fops: %pd",
+ dentry);
+ goto free_proxy;
+ }
+ }
+
+ goto out;
+free_proxy:
+ kfree(proxy_fops);
+ fops_put(real_fops);
+out:
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+const struct file_operations debugfs_full_proxy_file_operations = {
+ .open = full_proxy_open,
+};
+
+ssize_t debugfs_attr_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ ssize_t ret;
+ int srcu_idx;
+
+ ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!ret))
+ ret = simple_attr_read(file, buf, len, ppos);
+ debugfs_use_file_finish(srcu_idx);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_read);
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ ssize_t ret;
+ int srcu_idx;
+
+ ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!ret))
+ ret = simple_attr_write(file, buf, len, ppos);
+ debugfs_use_file_finish(srcu_idx);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write);
+
+static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
{
/* if there are no write bits set, make read only */
if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, fops_ro);
+ return debugfs_create_file_unsafe(name, mode, parent, value,
+ fops_ro);
/* if there are no read bits set, make write only */
if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, fops_wo);
+ return debugfs_create_file_unsafe(name, mode, parent, value,
+ fops_wo);
- return debugfs_create_file(name, mode, parent, value, fops);
+ return debugfs_create_file_unsafe(name, mode, parent, value, fops);
}
static int debugfs_u8_set(void *data, u64 val)
@@ -68,9 +341,9 @@ static int debugfs_u8_get(void *data, u64 *val)
*val = *(u8 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
/**
* debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
@@ -99,7 +372,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8,
&fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -114,9 +387,9 @@ static int debugfs_u16_get(void *data, u64 *val)
*val = *(u16 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
/**
* debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
@@ -145,7 +418,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16,
&fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -160,9 +433,9 @@ static int debugfs_u32_get(void *data, u64 *val)
*val = *(u32 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
/**
* debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
@@ -191,7 +464,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32,
&fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -207,9 +480,9 @@ static int debugfs_u64_get(void *data, u64 *val)
*val = *(u64 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
/**
* debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
@@ -238,7 +511,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64,
&fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
@@ -254,9 +527,10 @@ static int debugfs_ulong_get(void *data, u64 *val)
*val = *(unsigned long *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set,
+ "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
/**
* debugfs_create_ulong - create a debugfs file that is used to read and write
@@ -286,26 +560,30 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
struct dentry *parent, unsigned long *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
- &fops_ulong_ro, &fops_ulong_wo);
+ return debugfs_create_mode_unsafe(name, mode, parent, value,
+ &fops_ulong, &fops_ulong_ro,
+ &fops_ulong_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_ulong);
-DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set,
+ "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set,
+ "0x%08llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set,
+ "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -328,7 +606,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8,
&fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -346,7 +624,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16,
&fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -364,7 +642,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32,
&fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -382,7 +660,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64,
&fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -398,10 +676,10 @@ static int debugfs_size_t_get(void *data, u64 *val)
*val = *(size_t *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
- "%llu\n"); /* %llu and %zu are more or less the same */
-DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+ "%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -416,8 +694,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
- &fops_size_t_ro, &fops_size_t_wo);
+ return debugfs_create_mode_unsafe(name, mode, parent, value,
+ &fops_size_t, &fops_size_t_ro,
+ &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -431,10 +710,12 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+ "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+ "%lld\n");
/**
* debugfs_create_atomic_t - create a debugfs file that is used to read and
@@ -450,8 +731,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
- &fops_atomic_t_ro, &fops_atomic_t_wo);
+ return debugfs_create_mode_unsafe(name, mode, parent, value,
+ &fops_atomic_t, &fops_atomic_t_ro,
+ &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
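From the caller's side nothing changes; a minimal sketch with made-up names:

/* Sketch: expose a driver counter under an existing debugfs directory. */
static atomic_t example_hits = ATOMIC_INIT(0);

static void example_debugfs_init(struct dentry *dir)
{
	debugfs_create_atomic_t("hits", 0644, dir, &example_hits);
}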
@@ -459,9 +741,17 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[3];
- bool *val = file->private_data;
+ bool val;
+ int r, srcu_idx;
- if (*val)
+ r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!r))
+ val = *(bool *)file->private_data;
+ debugfs_use_file_finish(srcu_idx);
+ if (r)
+ return r;
+
+ if (val)
buf[0] = 'Y';
else
buf[0] = 'N';
@@ -477,6 +767,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
char buf[32];
size_t buf_size;
bool bv;
+ int r, srcu_idx;
bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
@@ -484,8 +775,14 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
return -EFAULT;
buf[buf_size] = '\0';
- if (strtobool(buf, &bv) == 0)
- *val = bv;
+ if (strtobool(buf, &bv) == 0) {
+ r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!r))
+ *val = bv;
+ debugfs_use_file_finish(srcu_idx);
+ if (r)
+ return r;
+ }
return count;
}
@@ -537,7 +834,7 @@ static const struct file_operations fops_bool_wo = {
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
struct dentry *parent, bool *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool,
&fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
@@ -546,8 +843,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
struct debugfs_blob_wrapper *blob = file->private_data;
- return simple_read_from_buffer(user_buf, count, ppos, blob->data,
- blob->size);
+ ssize_t r;
+ int srcu_idx;
+
+ r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!r))
+ r = simple_read_from_buffer(user_buf, count, ppos, blob->data,
+ blob->size);
+ debugfs_use_file_finish(srcu_idx);
+ return r;
}
static const struct file_operations fops_blob = {
@@ -584,7 +888,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
{
- return debugfs_create_file(name, mode, parent, blob, &fops_blob);
+ return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
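The blob helper likewise keeps its calling convention; a minimal sketch with made-up data:

/* Sketch: a read-only blob exposing a version string. */
static char example_version[] = "1.2.3";
static struct debugfs_blob_wrapper example_blob = {
	.data = example_version,
	.size = sizeof(example_version) - 1,
};

static void example_blob_init(struct dentry *dir)
{
	debugfs_create_blob("version", 0444, dir, &example_blob);
}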
@@ -689,7 +993,8 @@ struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
data->array = array;
data->elements = elements;
- return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+ return debugfs_create_file_unsafe(name, mode, parent, data,
+ &u32_array_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 8580831ed237..4bc1f68243c1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,9 +27,14 @@
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
+
+#include "internal.h"
#define DEBUGFS_DEFAULT_MODE 0700
+DEFINE_SRCU(debugfs_srcu);
+
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
@@ -39,7 +44,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime =
+ inode->i_ctime = current_fs_time(sb);
}
return inode;
}
@@ -294,6 +300,37 @@ static struct dentry *end_creating(struct dentry *dentry)
return dentry;
}
+static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *proxy_fops,
+ const struct file_operations *real_fops)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+ BUG_ON(!S_ISREG(mode));
+ dentry = start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = mode;
+ inode->i_private = data;
+
+ inode->i_fop = proxy_fops;
+ dentry->d_fsdata = (void *)real_fops;
+
+ d_instantiate(dentry, inode);
+ fsnotify_create(d_inode(dentry->d_parent), dentry);
+ return end_creating(dentry);
+}
+
/**
* debugfs_create_file - create a file in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
@@ -324,29 +361,52 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- struct dentry *dentry;
- struct inode *inode;
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
- BUG_ON(!S_ISREG(mode));
- dentry = start_creating(name, parent);
-
- if (IS_ERR(dentry))
- return NULL;
+ return __debugfs_create_file(name, mode, parent, data,
+ fops ? &debugfs_full_proxy_file_operations :
+ &debugfs_noop_file_operations,
+ fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file);
- inode = debugfs_get_inode(dentry->d_sb);
- if (unlikely(!inode))
- return failed_creating(dentry);
+/**
+ * debugfs_create_file_unsafe - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ * on. The inode.i_private pointer will point to this value on
+ * the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ * this file.
+ *
+ * debugfs_create_file_unsafe() is completely analogous to
+ * debugfs_create_file(), the only difference being that the fops
+ * handed to it will not be protected against file removals by the
+ * debugfs core.
+ *
+ * It is your responsibility to protect your struct file_operations
+ * methods against file removals by means of debugfs_use_file_start()
+ * and debugfs_use_file_finish(). ->open() is still protected by
+ * debugfs though.
+ *
+ * Any struct file_operations defined by means of
+ * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and
+ * thus, may be used here.
+ */
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
+{
- inode->i_mode = mode;
- inode->i_fop = fops ? fops : &debugfs_file_operations;
- inode->i_private = data;
- d_instantiate(dentry, inode);
- fsnotify_create(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return __debugfs_create_file(name, mode, parent, data,
+ fops ? &debugfs_open_proxy_file_operations :
+ &debugfs_noop_file_operations,
+ fops);
}
-EXPORT_SYMBOL_GPL(debugfs_create_file);
+EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
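The pairing described above — DEFINE_DEBUGFS_ATTRIBUTE() for the fops plus debugfs_create_file_unsafe() for the file — would look roughly like this; the attribute, its accessors and the variable are illustrative:

/* Sketch: a removal-safe attribute without the full proxying overhead. */
static u64 example_threshold;

static int example_threshold_get(void *data, u64 *val)
{
	*val = *(u64 *)data;
	return 0;
}

static int example_threshold_set(void *data, u64 val)
{
	*(u64 *)data = val;
	return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(example_threshold_fops, example_threshold_get,
			 example_threshold_set, "%llu\n");

static void example_attr_init(struct dentry *dir)
{
	debugfs_create_file_unsafe("threshold", 0644, dir, &example_threshold,
				   &example_threshold_fops);
}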
/**
* debugfs_create_file_size - create a file in the debugfs filesystem
@@ -461,7 +521,11 @@ struct dentry *debugfs_create_automount(const char *name,
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
dentry->d_fsdata = (void *)f;
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
d_instantiate(dentry, inode);
+ inc_nlink(d_inode(dentry->d_parent));
+ fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);
@@ -565,6 +629,8 @@ void debugfs_remove(struct dentry *dentry)
inode_unlock(d_inode(parent));
if (!ret)
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+
+ synchronize_srcu(&debugfs_srcu);
}
EXPORT_SYMBOL_GPL(debugfs_remove);
@@ -642,6 +708,8 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
inode_unlock(d_inode(parent));
+
+ synchronize_srcu(&debugfs_srcu);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
new file mode 100644
index 000000000000..bba52634b995
--- /dev/null
+++ b/fs/debugfs/internal.h
@@ -0,0 +1,26 @@
+/*
+ * internal.h - declarations internal to debugfs
+ *
+ * Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ */
+
+#ifndef _DEBUGFS_INTERNAL_H_
+#define _DEBUGFS_INTERNAL_H_
+
+struct file_operations;
+
+/* declared over in file.c */
+extern const struct file_operations debugfs_noop_file_operations;
+extern const struct file_operations debugfs_open_proxy_file_operations;
+extern const struct file_operations debugfs_full_proxy_file_operations;
+
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops);
+
+#endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0b2954d7172d..37c134a132c7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = {
static DEFINE_MUTEX(allocated_ptys_lock);
-static struct vfsmount *devpts_mnt;
-
struct pts_mount_opts {
int setuid;
int setgid;
@@ -104,7 +102,7 @@ struct pts_mount_opts {
kgid_t gid;
umode_t mode;
umode_t ptmxmode;
- int newinstance;
+ int reserve;
int max;
};
@@ -117,11 +115,9 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
{Opt_max, "max=%d"},
-#endif
{Opt_err, NULL}
};
@@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
-static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+struct pts_fs_info *devpts_acquire(struct file *filp)
{
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
- if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- return inode->i_sb;
-#endif
- if (!devpts_mnt)
- return NULL;
- return devpts_mnt->mnt_sb;
+ struct pts_fs_info *result;
+ struct path path;
+ struct super_block *sb;
+ int err;
+
+ path = filp->f_path;
+ path_get(&path);
+
+ /* Has the devpts filesystem already been found? */
+ sb = path.mnt->mnt_sb;
+ if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
+ /* Is a devpts filesystem at "pts" in the same directory? */
+ err = path_pts(&path);
+ if (err) {
+ result = ERR_PTR(err);
+ goto out;
+ }
+
+ /* Is the path the root of a devpts filesystem? */
+ result = ERR_PTR(-ENODEV);
+ sb = path.mnt->mnt_sb;
+ if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (path.mnt->mnt_root != sb->s_root))
+ goto out;
+ }
+
+ /*
+ * The pty code needs to hold extra references in case of a last /dev/tty close.
+ */
+ atomic_inc(&sb->s_active);
+ result = DEVPTS_SB(sb);
+
+out:
+ path_put(&path);
+ return result;
+}
+
+void devpts_release(struct pts_fs_info *fsi)
+{
+ deactivate_super(fsi->sb);
}
#define PARSE_MOUNT 0
@@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
/*
* parse_mount_options():
* Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * specified in @data, set it to its default value.
*
* Note: @data may be NULL (in which case all options are set to default).
*/
@@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
- /* newinstance makes sense only on initial mount */
+ /* Only allow instances mounted from the initial mount
+ * namespace to tap the reserve pool of ptys.
+ */
if (op == PARSE_MOUNT)
- opts->newinstance = 0;
+ opts->reserve =
+ (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
case Opt_ptmxmode:
if (match_octal(&args[0], &option))
return -EINVAL;
opts->ptmxmode = option & S_IALLUGO;
break;
case Opt_newinstance:
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 1;
break;
case Opt_max:
if (match_int(&args[0], &option) ||
@@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->max = option;
break;
-#endif
default:
pr_err("called with bogus options\n");
return -EINVAL;
@@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return 0;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
int mode;
@@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
- return;
-}
-#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
{
@@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
if (opts->max < NR_UNIX98_PTY_MAX)
seq_printf(seq, ",max=%d", opts->max);
-#endif
return 0;
}
@@ -410,40 +426,11 @@ fail:
return -ENOMEM;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
- if (devpts_mnt)
- return devpts_mnt->mnt_sb == s;
- return 0;
-}
-
/*
* devpts_mount()
*
- * If the '-o newinstance' mount option was specified, mount a new
- * (private) instance of devpts. PTYs created in this instance are
- * independent of the PTYs in other devpts instances.
- *
- * If the '-o newinstance' option was not specified, mount/remount the
- * initial kernel mount of devpts. This type of mount gives the
- * legacy, single-instance semantics.
- *
- * The 'newinstance' option is needed to support multiple namespace
- * semantics in devpts while preserving backward compatibility of the
- * current 'single-namespace' semantics. i.e all mounts of devpts
- * without the 'newinstance' mount option should bind to the initial
- * kernel mount, like mount_single().
- *
- * Mounts with 'newinstance' option create a new, private namespace.
- *
- * NOTE:
- *
- * For single-mount semantics, devpts cannot use mount_single(),
- * because mount_single()/sget() find and use the super-block from
- * the most recent mount of devpts. But that recent mount may be a
- * 'newinstance' mount and mount_single() would pick the newinstance
- * super-block instead of the initial super-block.
+ * Mount a new (private) instance of devpts. PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
*/
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
@@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
return ERR_PTR(error);
- /* Require newinstance for all user namespace mounts to ensure
- * the mount options are not changed.
- */
- if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
- return ERR_PTR(-EINVAL);
-
- if (opts.newinstance)
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- else
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
- NULL);
-
+ s = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
@@ -491,18 +467,6 @@ out_undo_sget:
return ERR_PTR(error);
}
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
.fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
};
/*
@@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi)
int index;
int ida_ret;
- if (!fsi)
- return -ENODEV;
-
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
mutex_lock(&allocated_ptys_lock);
- if (pty_count >= pty_limit -
- (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ if (pty_count >= (pty_limit -
+ (fsi->mount_opts.reserve ? 0 : pty_reserve))) {
mutex_unlock(&allocated_ptys_lock);
return -ENOSPC;
}
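
The reserve check above boils down to simple arithmetic: mounts flagged as coming from the initial mount namespace may consume the whole pty limit, while other mounts must stop a reserve short of it. A minimal userspace sketch of that calculation (the pty_limit and pty_reserve values here are illustrative, not taken from the patch):

#include <stdio.h>

int main(void)
{
	int pty_limit = 4096;     /* illustrative limit */
	int pty_reserve = 1024;   /* illustrative reserve kept for the initial namespace */
	int mount_has_reserve;

	for (mount_has_reserve = 0; mount_has_reserve <= 1; mount_has_reserve++) {
		/* Same shape as the devpts_new_index() test: ordinary mounts
		 * must leave pty_reserve ptys unused. */
		int ceiling = pty_limit - (mount_has_reserve ? 0 : pty_reserve);

		printf("%s mount: ENOSPC once pty_count reaches %d\n",
		       mount_has_reserve ? "initial-namespace" : "ordinary", ceiling);
	}
	return 0;
}
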
@@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx)
mutex_unlock(&allocated_ptys_lock);
}
-/*
- * pty code needs to hold extra references in case of last /dev/tty close
- */
-struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file)
-{
- struct super_block *sb;
- struct pts_fs_info *fsi;
-
- sb = pts_sb_from_inode(ptmx_inode);
- if (!sb)
- return NULL;
- fsi = DEVPTS_SB(sb);
- if (!fsi)
- return NULL;
-
- atomic_inc(&sb->s_active);
- return fsi;
-}
-
-void devpts_put_ref(struct pts_fs_info *fsi)
-{
- deactivate_super(fsi->sb);
-}
-
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
@@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi)
struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
{
struct dentry *dentry;
- struct super_block *sb;
+ struct super_block *sb = fsi->sb;
struct inode *inode;
struct dentry *root;
struct pts_mount_opts *opts;
char s[12];
- if (!fsi)
- return ERR_PTR(-ENODEV);
-
- sb = fsi->sb;
root = sb->s_root;
opts = &fsi->mount_opts;
@@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry)
static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
- struct ctl_table_header *table;
-
if (!err) {
- struct vfsmount *mnt;
-
- table = register_sysctl_table(pty_root_table);
- mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- unregister_filesystem(&devpts_fs_type);
- unregister_sysctl_table(table);
- } else {
- devpts_mnt = mnt;
- }
+ register_sysctl_table(pty_root_table);
}
return err;
}
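
For context, devpts_acquire() now resolves the devpts instance from the file the caller actually opened rather than from a single system-wide kernel mount. The userspace flow is unchanged; a minimal sketch using only the standard POSIX pty API (nothing devpts-specific assumed):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	/* Open the pty master; on Linux this is the "ptmx" node of a devpts mount. */
	int master = posix_openpt(O_RDWR | O_NOCTTY);

	if (master < 0) {
		perror("posix_openpt");
		return 1;
	}
	/* Prepare the slave side before anyone opens it. */
	if (grantpt(master) < 0 || unlockpt(master) < 0) {
		perror("grantpt/unlockpt");
		close(master);
		return 1;
	}
	/* The slave lives in the devpts instance that backs the ptmx we opened. */
	printf("slave: %s\n", ptsname(master));
	close(master);
	return 0;
}
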
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3bf3f20f8ecc..f3b4408be590 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
map_bh->b_size = fs_count << i_blkbits;
/*
- * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
- * forbid block creations: only overwrites are permitted.
- * We will return early to the caller once we see an
- * unmapped buffer head returned, and the caller will fall
- * back to buffered I/O.
+ * For writes that could fill holes inside i_size on a
+ * DIO_SKIP_HOLES filesystem we forbid block creations: only
+ * overwrites are permitted. We will return early to the caller
+ * once we see an unmapped buffer head returned, and the caller
+ * will fall back to buffered I/O.
*
* Otherwise the decision is left to the get_blocks method,
* which may decide to handle it or also return an unmapped
@@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
*/
create = dio->rw & WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
- if (sdio->block_in_file < (i_size_read(dio->inode) >>
- sdio->blkbits))
+ if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
+ i_blkbits))
create = 0;
}
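
The effect of the DIO_SKIP_HOLES change is easiest to see with the block arithmetic spelled out: the old test compared against the first block past i_size, so a write into the final, partially used block was treated as beyond i_size and allowed to allocate; the new test compares against the block that actually holds the last byte. A small userspace check (4 KiB filesystem blocks assumed for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;     /* 4096-byte filesystem blocks (assumed) */
	long long i_size = 6000;       /* EOF in the middle of block 1 */
	long long blk;

	for (blk = 0; blk < 3; blk++) {
		int old_inside = blk < (i_size >> blkbits);          /* old: strictly below the EOF block */
		int new_inside = blk <= ((i_size - 1) >> blkbits);   /* new: up to the block holding EOF */

		printf("block %lld: old=%s new=%s\n", blk,
		       old_inside ? "no-create" : "may-create",
		       new_inside ? "no-create" : "may-create");
	}
	return 0;
}
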
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index ebd40f46ed4c..0d8eb3455b34 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
static int
ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode,
char *page_virt, size_t size)
{
int rc;
- rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt,
- size, 0);
+ rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode,
+ ECRYPTFS_XATTR_NAME, page_virt, size, 0);
return rc;
}
@@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
goto out_free;
}
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
- rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
- size);
+ rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode,
+ virt, size);
else
rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
virt_len);
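
ecryptfs_write_metadata_to_xattr() ultimately stores the header blob in an extended attribute on the lower file. A userspace sketch of the same idea with setxattr(2), purely illustrative; the path and attribute name below are placeholders, not taken from the code above:

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	const char blob[] = "example-header-bytes";

	/* "lower-file" and "user.example" are placeholders for illustration. */
	if (setxattr("lower-file", "user.example", blob, sizeof(blob), 0) < 0) {
		perror("setxattr");
		return 1;
	}
	return 0;
}
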
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 3ec495db7e82..4ba1547bb9ad 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -609,8 +609,8 @@ ssize_t
ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
const char *name, void *value, size_t size);
int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags);
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
#ifdef CONFIG_ECRYPT_FS_MESSAGING
int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 318b04689d76..9d153b6a1d72 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
}
int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags)
{
int rc = 0;
@@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
- if (!rc && d_really_is_positive(dentry))
- fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
+ if (!rc && inode)
+ fsstack_copy_attr_all(inode, d_inode(lower_dentry));
out:
return rc;
}
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index 866bb18efefe..e818f5ac7a26 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/mount.h>
+#include <linux/file.h>
#include "ecryptfs_kernel.h"
struct ecryptfs_open_req {
@@ -147,7 +148,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR;
(*lower_file) = dentry_open(&req.path, flags, cred);
if (!IS_ERR(*lower_file))
- goto out;
+ goto have_file;
if ((flags & O_ACCMODE) == O_RDONLY) {
rc = PTR_ERR((*lower_file));
goto out;
@@ -165,8 +166,16 @@ int ecryptfs_privileged_open(struct file **lower_file,
mutex_unlock(&ecryptfs_kthread_ctl.mux);
wake_up(&ecryptfs_kthread_ctl.wait);
wait_for_completion(&req.done);
- if (IS_ERR(*lower_file))
+ if (IS_ERR(*lower_file)) {
rc = PTR_ERR(*lower_file);
+ goto out;
+ }
+have_file:
+ if ((*lower_file)->f_op->mmap == NULL) {
+ fput(*lower_file);
+ *lower_file = NULL;
+ rc = -EMEDIUMTYPE;
+ }
out:
return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 148d11b514fb..9c3437c8a5b1 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
if (size < 0)
size = 8;
put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
- rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+ rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode,
+ ECRYPTFS_XATTR_NAME,
xattr_virt, size, 0);
inode_unlock(lower_inode);
if (rc)
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index e2ab6d0497f2..1d73fc6dba13 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/uuid.h>
#include "internal.h"
@@ -46,11 +47,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
*/
bool efivarfs_valid_name(const char *str, int len)
{
- static const char dashes[EFI_VARIABLE_GUID_LEN] = {
- [8] = 1, [13] = 1, [18] = 1, [23] = 1
- };
const char *s = str + len - EFI_VARIABLE_GUID_LEN;
- int i;
/*
* We need a GUID, plus at least one letter for the variable name,
@@ -68,37 +65,7 @@ bool efivarfs_valid_name(const char *str, int len)
*
* 12345678-1234-1234-1234-123456789abc
*/
- for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) {
- if (dashes[i]) {
- if (*s++ != '-')
- return false;
- } else {
- if (!isxdigit(*s++))
- return false;
- }
- }
-
- return true;
-}
-
-static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
-{
- guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]);
- guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]);
- guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]);
- guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]);
- guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]);
- guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]);
- guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]);
- guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]);
- guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]);
- guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]);
- guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]);
- guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]);
- guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]);
- guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]);
- guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]);
- guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]);
+ return uuid_is_valid(s);
}
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
@@ -119,8 +86,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
- efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
- &var->var.VendorGuid);
+ uuid_le_to_bin(dentry->d_name.name + namelen + 1, &var->var.VendorGuid);
if (efivar_variable_is_removable(var->var.VendorGuid,
dentry->d_name.name, namelen))
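
uuid_is_valid() performs the same shape check the removed table-driven loop did: dashes at offsets 8, 13, 18 and 23, hex digits everywhere else. A standalone userspace sketch of that check (not the kernel implementation):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define GUID_LEN 36

/* Dashes at offsets 8, 13, 18 and 23; hex digits everywhere else. */
static int guid_is_valid(const char *s)
{
	int i;

	if (strlen(s) != GUID_LEN)
		return 0;
	for (i = 0; i < GUID_LEN; i++) {
		if (i == 8 || i == 13 || i == 18 || i == 23) {
			if (s[i] != '-')
				return 0;
		} else if (!isxdigit((unsigned char)s[i])) {
			return 0;
		}
	}
	return 1;
}

int main(void)
{
	printf("%d\n", guid_is_valid("12345678-1234-1234-1234-123456789abc")); /* 1 */
	printf("%d\n", guid_is_valid("not-a-guid"));                           /* 0 */
	return 0;
}
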
diff --git a/fs/efs/super.c b/fs/efs/super.c
index cb68dac4f9d3..368f7dd21c61 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -275,7 +275,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
if (!bh) {
pr_err("cannot read volume header\n");
- return -EINVAL;
+ return -EIO;
}
/*
@@ -293,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
bh = sb_bread(s, sb->fs_start + EFS_SUPER);
if (!bh) {
pr_err("cannot read superblock\n");
- return -EINVAL;
+ return -EIO;
}
if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a74a2a52e0f..10db91218933 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep,
return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
-static inline struct timespec ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_mstimeout(long ms)
{
- struct timespec now, ts = {
+ struct timespec64 now, ts = {
.tv_sec = ms / MSEC_PER_SEC,
.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
};
- ktime_get_ts(&now);
- return timespec_add_safe(now, ts);
+ ktime_get_ts64(&now);
+ return timespec64_add_safe(now, ts);
}
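
ep_set_mstimeout() simply turns a relative millisecond timeout into an absolute deadline; switching it to the 64-bit timespec helpers is part of the y2038 conversion work for 32-bit kernels. A userspace sketch of the same conversion (CLOCK_MONOTONIC assumed as the time base, without the saturating add that timespec64_add_safe provides):

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC  1000000000L
#define NSEC_PER_MSEC 1000000L
#define MSEC_PER_SEC  1000L

static struct timespec ms_to_deadline(long ms)
{
	struct timespec now, end;

	clock_gettime(CLOCK_MONOTONIC, &now);
	end.tv_sec  = now.tv_sec + ms / MSEC_PER_SEC;
	end.tv_nsec = now.tv_nsec + NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
	if (end.tv_nsec >= NSEC_PER_SEC) {          /* normalise the nanosecond carry */
		end.tv_sec++;
		end.tv_nsec -= NSEC_PER_SEC;
	}
	return end;
}

int main(void)
{
	struct timespec t = ms_to_deadline(1500);

	printf("deadline: %ld.%09ld\n", (long)t.tv_sec, t.tv_nsec);
	return 0;
}
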
/**
@@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
ktime_t expires, *to = NULL;
if (timeout > 0) {
- struct timespec end_time = ep_set_mstimeout(timeout);
+ struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
- *to = timespec_to_ktime(end_time);
+ *to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
diff --git a/fs/exec.c b/fs/exec.c
index a98b21d47385..887c1c955df8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -243,10 +243,6 @@ static void put_arg_page(struct page *page)
put_page(page);
}
-static void free_arg_page(struct linux_binprm *bprm, int i)
-{
-}
-
static void free_arg_pages(struct linux_binprm *bprm)
{
}
@@ -267,7 +263,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
if (!vma)
return -ENOMEM;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem)) {
+ err = -EINTR;
+ goto err_free;
+ }
vma->vm_mm = mm;
/*
@@ -294,6 +293,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
return 0;
err:
up_write(&mm->mmap_sem);
+err_free:
bprm->vma = NULL;
kmem_cache_free(vm_area_cachep, vma);
return err;
@@ -700,7 +700,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
vm_flags = VM_STACK_FLAGS;
/*
@@ -850,15 +852,25 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
if (ret)
return ret;
+ ret = deny_write_access(file);
+ if (ret)
+ return ret;
+
i_size = i_size_read(file_inode(file));
- if (max_size > 0 && i_size > max_size)
- return -EFBIG;
- if (i_size <= 0)
- return -EINVAL;
+ if (max_size > 0 && i_size > max_size) {
+ ret = -EFBIG;
+ goto out;
+ }
+ if (i_size <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
*buf = vmalloc(i_size);
- if (!*buf)
- return -ENOMEM;
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
pos = 0;
while (pos < i_size) {
@@ -876,18 +888,21 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
if (pos != i_size) {
ret = -EIO;
- goto out;
+ goto out_free;
}
ret = security_kernel_post_read_file(file, *buf, i_size, id);
if (!ret)
*size = pos;
-out:
+out_free:
if (ret < 0) {
vfree(*buf);
*buf = NULL;
}
+
+out:
+ allow_write_access(file);
return ret;
}
EXPORT_SYMBOL_GPL(kernel_read_file);
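
The kernel_read_file() change adds a second label so that deny_write_access() is always paired with allow_write_access(), while the buffer is freed only on the paths that actually allocated it. The underlying pattern is ordinary C error unwinding with one label per resource; a hedged userspace sketch of the same shape, using stdio instead of kernel APIs:

#include <stdio.h>
#include <stdlib.h>

/* Reads a whole file into a freshly malloc()ed buffer; later-acquired
 * resources get their own label so early failures skip their cleanup. */
static int read_all(const char *path, char **buf, long *size)
{
	int ret = -1;
	FILE *f = fopen(path, "rb");          /* first resource: open the file */

	if (!f)
		return -1;
	if (fseek(f, 0, SEEK_END) != 0 || (*size = ftell(f)) < 0)
		goto out;                     /* nothing allocated yet */
	*buf = malloc(*size ? *size : 1);
	if (!*buf)
		goto out;
	rewind(f);
	if (fread(*buf, 1, *size, f) != (size_t)*size)
		goto out_free;                /* undo only the allocation */
	ret = 0;
	goto out;

out_free:
	free(*buf);
	*buf = NULL;
out:
	fclose(f);                            /* always undo the first step */
	return ret;
}

int main(int argc, char **argv)
{
	char *buf = NULL;
	long size = 0;

	if (argc > 1 && read_all(argv[1], &buf, &size) == 0)
		printf("read %ld bytes from %s\n", size, argv[1]);
	free(buf);
	return 0;
}
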
@@ -1486,9 +1501,6 @@ int remove_arg_zero(struct linux_binprm *bprm)
kunmap_atomic(kaddr);
put_arg_page(page);
-
- if (offset == PAGE_SIZE)
- free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
} while (offset == PAGE_SIZE);
bprm->p++;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805..868c02317b05 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+ ret = __dax_fault(vma, vmf, ext2_get_block);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
}
down_read(&ei->dax_sem);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
up_read(&ei->dax_sem);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b675610391b8..fcbe58641e40 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,6 +26,7 @@
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
+#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
- err = dax_clear_sectors(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) <<
- (inode->i_blkbits - 9),
- 1 << inode->i_blkbits);
+ err = sb_issue_zeroout(inode->i_sb,
+ le32_to_cpu(chain[depth-1].key), count,
+ GFP_NOFS);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- }
+ } else
+ set_buffer_new(bh_result);
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
- set_buffer_new(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b78caf25f746..1d9379568aa8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
- if (blocksize != PAGE_SIZE) {
- ext2_msg(sb, KERN_ERR,
- "error: unsupported blocksize for dax");
+ err = bdev_dax_supported(sb, blocksize);
+ if (err)
goto failed_mount;
- }
- if (!sb->s_bdev->bd_disk->fops->direct_access) {
- ext2_msg(sb, KERN_ERR,
- "error: device does not support dax");
- goto failed_mount;
- }
}
/* If the blocksize doesn't match, re-read the thing.. */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 7fd3b867ce65..7b9e9c1842d5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler,
static int
ext2_xattr_security_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 0f85705ff519..65049b71af13 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
static int
ext2_xattr_trusted_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 1fafd27037cc..fb2f992ae763 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
static int
ext2_xattr_user_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- if (!test_opt(dentry->d_sb, XATTR_USER))
+ if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fe1f50fe764f..3020fd70c392 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -610,7 +610,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
- return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ return 1;
}
/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 5d00bf060254..68323e3da3fa 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -150,6 +150,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto errout;
+ }
+ cond_resched();
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
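
The fatal_signal_pending()/cond_resched() pair added to ext4_readdir() makes a potentially long directory walk interruptible and preemptible. The rough userspace analogue is polling a flag set from a signal handler inside the loop; a minimal sketch:

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t interrupted;

static void on_sigint(int sig)
{
	(void)sig;
	interrupted = 1;
}

int main(void)
{
	long i, sum = 0;

	signal(SIGINT, on_sigint);
	for (i = 0; i < 2000000000L; i++) {
		/* Bail out promptly instead of grinding through the whole loop. */
		if (interrupted) {
			fprintf(stderr, "interrupted at iteration %ld\n", i);
			return 1;
		}
		sum += i;
	}
	printf("%ld\n", sum);
	return 0;
}
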
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 72f4c9e00e97..b84aa1ca480a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@
#include <linux/ratelimit.h>
#include <crypto/hash.h>
#include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -581,6 +582,9 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
+ /* Caller will submit data before dropping transaction handle. This
+ * allows jbd2 to avoid submitting data before commit. */
+#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
/*
* The bit position of these flags must not overlap with any of the
@@ -1505,6 +1509,9 @@ struct ext4_sb_info {
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+
+ /* Barrier between changing inodes' journal flags and writepages ops. */
+ struct percpu_rw_semaphore s_journal_flag_rwsem;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1549,7 +1556,6 @@ enum {
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
- EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
@@ -2521,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2581,7 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -3329,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
}
+static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
+{
+ int blksize = 1 << inode->i_blkbits;
+
+ return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
+}
+
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
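
ext4_aligned_io() is just two power-of-two alignment tests; later in this patch it lets aligned DAX writes past EOF skip block zeroing. The same check in standalone form, with IS_ALIGNED expanded by hand and 4 KiB blocks assumed:

#include <stdio.h>

/* IS_ALIGNED(x, a) for a power-of-two a, as in the kernel macro. */
#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

static int aligned_io(long long off, long long len, int blkbits)
{
	long long blksize = 1LL << blkbits;

	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
}

int main(void)
{
	printf("%d\n", aligned_io(8192, 4096, 12));  /* 1: both on 4 KiB boundaries */
	printf("%d\n", aligned_io(8192, 6000, 12));  /* 0: length covers a partial block */
	return 0;
}
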
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5f5846211095..09c1ef38cbe6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -359,10 +359,21 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return 0;
}
-static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+static inline int ext4_jbd2_inode_add_write(handle_t *handle,
+ struct inode *inode)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+ return jbd2_journal_inode_add_write(handle,
+ EXT4_I(inode)->jinode);
+ return 0;
+}
+
+static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
+ struct inode *inode)
+{
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_inode_add_wait(handle,
+ EXT4_I(inode)->jinode);
return 0;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 95bf4679ac54..2a2eef9c14e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
if (!ext4_handle_valid(handle))
return 0;
- if (handle->h_buffer_credits > needed)
+ if (handle->h_buffer_credits >= needed)
return 0;
- err = ext4_journal_extend(handle, needed);
+ /*
+ * If we need to extend the journal, get a few extra blocks
+ * while we're at it for efficiency's sake.
+ */
+ needed += 3;
+ err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
if (err <= 0)
return err;
err = ext4_truncate_restart_trans(handle, inode, needed);
@@ -907,13 +912,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
eh = ext_block_hdr(bh);
ppos++;
- if (unlikely(ppos > depth)) {
- put_bh(bh);
- EXT4_ERROR_INODE(inode,
- "ppos %d > depth %d", ppos, depth);
- ret = -EFSCORRUPTED;
- goto err;
- }
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
}
@@ -2583,7 +2581,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
}
} else
ext4_error(sbi->s_sb, "strange request: removal(2) "
- "%u-%u from %u:%u\n",
+ "%u-%u from %u:%u",
from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
@@ -3738,7 +3736,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef EXT4_DEBUG
ext4_warning("Inode (%ld) finished: extent logical block %llu,"
- " len %u; IO logical block %llu, len %u\n",
+ " len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e38b987ac7f5..37e059202cd2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
(status & EXTENT_STATUS_WRITTEN)) {
ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
" delayed and written which can potentially "
- " cause data loss.\n", lblk, len);
+ " cause data loss.", lblk, len);
WARN_ON(1);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 00ff6912adb3..df44c877892a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
+ result = __dax_fault(vma, vmf, ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
result = VM_FAULT_SIGBUS;
else
result = __dax_pmd_fault(vma, addr, pmd, flags,
- ext4_dax_mmap_get_block, NULL);
+ ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
@@ -373,7 +373,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ext4_encrypted_inode(d_inode(dir)) &&
!ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) {
ext4_warning(inode->i_sb,
- "Inconsistent encryption contexts: %lu/%lu\n",
+ "Inconsistent encryption contexts: %lu/%lu",
(unsigned long) d_inode(dir)->i_ino,
(unsigned long) inode->i_ino);
dput(dir);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 237b877d316d..3da4cf8d18b6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1150,25 +1150,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
ext4_group_t block_group;
int bit;
- struct buffer_head *bitmap_bh;
+ struct buffer_head *bitmap_bh = NULL;
struct inode *inode = NULL;
- long err = -EIO;
+ int err = -EFSCORRUPTED;
- /* Error cases - e2fsck has already cleaned up for us */
- if (ino > max_ino) {
- ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
- err = -EFSCORRUPTED;
- goto error;
- }
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ goto bad_orphan;
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
- ino, err);
- goto error;
+ ext4_error(sb, "inode bitmap error %ld for orphan %lu",
+ ino, PTR_ERR(bitmap_bh));
+ return (struct inode *) bitmap_bh;
}
/* Having the inode bit set should be a 100% indicator that this
@@ -1179,15 +1174,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
goto bad_orphan;
inode = ext4_iget(sb, ino);
- if (IS_ERR(inode))
- goto iget_failed;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
+ ino, err);
+ return inode;
+ }
/*
- * If the orphans has i_nlinks > 0 then it should be able to be
- * truncated, otherwise it won't be removed from the orphan list
- * during processing and an infinite loop will result.
+ * If the orphan has i_nlink > 0 then it should be able to
+ * be truncated, otherwise it won't be removed from the orphan
+ * list during processing and an infinite loop will result.
+ * Similarly, it must not be a bad inode.
*/
- if (inode->i_nlink && !ext4_can_truncate(inode))
+ if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
+ is_bad_inode(inode))
goto bad_orphan;
if (NEXT_ORPHAN(inode) > max_ino)
@@ -1195,29 +1196,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
brelse(bitmap_bh);
return inode;
-iget_failed:
- err = PTR_ERR(inode);
- inode = NULL;
bad_orphan:
- ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
- bit, (unsigned long long)bitmap_bh->b_blocknr,
- ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_WARNING "inode=%p\n", inode);
+ ext4_error(sb, "bad orphan inode %lu", ino);
+ if (bitmap_bh)
+ printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ bit, (unsigned long long)bitmap_bh->b_blocknr,
+ ext4_test_bit(bit, bitmap_bh->b_data));
if (inode) {
- printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
+ printk(KERN_ERR "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_WARNING "max_ino=%lu\n", max_ino);
- printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
+ printk(KERN_ERR "max_ino=%lu\n", max_ino);
+ printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
iput(inode);
}
brelse(bitmap_bh);
-error:
return ERR_PTR(err);
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 627b7e8f9ef3..bc15c2c17633 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -649,133 +649,6 @@ out:
}
/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- loff_t offset = iocb->ki_pos;
- handle_t *handle;
- ssize_t ret;
- int orphan = 0;
- size_t count = iov_iter_count(iter);
- int retries = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- loff_t final_size = offset + count;
-
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext4_journal_stop(handle);
- }
- }
-
-retry:
- if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
- /*
- * Nolock dioread optimization may be dynamically disabled
- * via ext4_inode_block_unlocked_dio(). Check inode's state
- * while holding extra i_dio_count ref.
- */
- inode_dio_begin(inode);
- smp_mb();
- if (unlikely(ext4_test_inode_state(inode,
- EXT4_STATE_DIOREAD_LOCK))) {
- inode_dio_end(inode);
- goto locked;
- }
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter,
- ext4_dio_get_block, NULL, 0);
- else
- ret = __blockdev_direct_IO(iocb, inode,
- inode->i_sb->s_bdev, iter,
- ext4_dio_get_block,
- NULL, NULL, 0);
- inode_dio_end(inode);
- } else {
-locked:
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter,
- ext4_dio_get_block, NULL, DIO_LOCKING);
- else
- ret = blockdev_direct_IO(iocb, inode, iter,
- ext4_dio_get_block);
-
- if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
-
- if (end > isize)
- ext4_truncate_failed_write(inode);
- }
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
- if (orphan) {
- int err;
-
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
-
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- return ret;
-}
-
-/*
* Calculate the number of metadata blocks need to reserve
* to allocate a new block at @lblocks for non extent file based file
*/
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7bc6c855cc18..ff7538c26992 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1780,7 +1780,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - "
"inode %u, rec_len %u, name_len %d"
- "inline size %d\n",
+ "inline size %d",
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 79b298d397b4..f7140ca66e3b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -684,6 +684,24 @@ out_sem:
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
+
+ /*
+ * Inodes with freshly allocated blocks where contents will be
+ * visible after transaction commit must be on transaction's
+ * ordered data list.
+ */
+ if (map->m_flags & EXT4_MAP_NEW &&
+ !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+ !(flags & EXT4_GET_BLOCKS_ZERO) &&
+ !IS_NOQUOTA(inode) &&
+ ext4_should_order_data(inode)) {
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ ret = ext4_jbd2_inode_add_wait(handle, inode);
+ else
+ ret = ext4_jbd2_inode_add_write(handle, inode);
+ if (ret)
+ return ret;
+ }
}
return retval;
}
@@ -1289,15 +1307,6 @@ static int ext4_write_end(struct file *file,
int i_size_changed = 0;
trace_ext4_write_end(inode, pos, len, copied);
- if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
- ret = ext4_jbd2_file_inode(handle, inode);
- if (ret) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- }
-
if (ext4_has_inline_data(inode)) {
ret = ext4_write_inline_data_end(inode, pos, len,
copied, page);
@@ -2313,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ EXT4_GET_BLOCKS_METADATA_NOFAIL |
+ EXT4_GET_BLOCKS_IO_SUBMIT;
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2602,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
struct blk_plug plug;
bool give_up_on_write = false;
+ percpu_down_read(&sbi->s_journal_flag_rwsem);
trace_ext4_writepages(inode, wbc);
- if (dax_mapping(mapping))
- return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
- wbc);
+ if (dax_mapping(mapping)) {
+ ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+ wbc);
+ goto out_writepages;
+ }
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2776,6 +2789,7 @@ retry:
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
+ percpu_up_read(&sbi->s_journal_flag_rwsem);
return ret;
}
@@ -3215,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
#ifdef CONFIG_FS_DAX
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+/*
+ * Get block function for DAX IO and mmap faults. It takes care of converting
+ * unwritten extents to written ones and initializes new / converted blocks
+ * to zeros.
+ */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
- int ret, err;
- int credits;
- struct ext4_map_blocks map;
- handle_t *handle = NULL;
- int flags = 0;
-
- ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
- inode->i_ino, create);
- map.m_lblk = iblock;
- map.m_len = bh_result->b_size >> inode->i_blkbits;
- credits = ext4_chunk_trans_blocks(inode, map.m_len);
- if (create) {
- flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
- }
- }
+ int ret;
- ret = ext4_map_blocks(handle, inode, &map, flags);
- if (create) {
- err = ext4_journal_stop(handle);
- if (ret >= 0 && err < 0)
- ret = err;
- }
- if (ret <= 0)
- goto out;
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int err2;
+ ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
+ if (!create)
+ return _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * We are protected by i_mmap_sem so we know block cannot go
- * away from under us even though we dropped i_data_sem.
- * Convert extent to written and write zeros there.
- *
- * Note: We may get here even when create == 0.
- */
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (ret < 0)
+ return ret;
- err = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
- if (err < 0)
- ret = err;
- err2 = ext4_journal_stop(handle);
- if (err2 < 0 && ret > 0)
- ret = err2;
- }
-out:
- WARN_ON_ONCE(ret == 0 && create);
- if (ret > 0) {
- map_bh(bh_result, inode->i_sb, map.m_pblk);
+ if (buffer_unwritten(bh_result)) {
/*
- * At least for now we have to clear BH_New so that DAX code
- * doesn't attempt to zero blocks again in a racy way.
+ * We are protected by i_mmap_sem or i_mutex so we know block
+ * cannot go away from under us even though we dropped
+ * i_data_sem. Convert extent to written and write zeros there.
*/
- map.m_flags &= ~EXT4_MAP_NEW;
- ext4_update_bh_state(bh_result, map.m_flags);
- bh_result->b_size = map.m_len << inode->i_blkbits;
- ret = 0;
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_CONVERT |
+ EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (ret < 0)
+ return ret;
}
- return ret;
+ /*
+ * At least for now we have to clear BH_New so that DAX code
+ * doesn't attempt to zero blocks again in a racy way.
+ */
+ clear_buffer_new(bh_result);
+ return 0;
+}
+#else
+/* Just define empty function, it will never get called. */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ BUG();
+ return 0;
}
#endif
@@ -3316,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
}
/*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
@@ -3334,10 +3327,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
* if the machine crashes during the write.
*
*/
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
ssize_t ret;
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(iter);
@@ -3345,10 +3339,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
+ int orphan = 0;
+ handle_t *handle;
- /* Use the old path for reads and writes beyond i_size. */
- if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
- return ext4_ind_direct_IO(iocb, iter);
+ if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ orphan = 1;
+ ei->i_disksize = inode->i_size;
+ ext4_journal_stop(handle);
+ }
BUG_ON(iocb->private == NULL);
@@ -3357,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* conversion. This also disallows race between truncate() and
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
*/
- if (iov_iter_rw(iter) == WRITE)
- inode_dio_begin(inode);
+ inode_dio_begin(inode);
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
@@ -3367,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode_unlock(inode);
/*
- * We could direct write to holes and fallocate.
+ * For extent mapped files we could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as unwritten to prevent
* parallel buffered read to expose the stale data before DIO complete
@@ -3389,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
iocb->private = NULL;
if (overwrite)
get_block_func = ext4_dio_get_block_overwrite;
- else if (is_sync_kiocb(iocb)) {
+ else if (IS_DAX(inode)) {
+ /*
+ * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+ * writes need zeroing either because they can race with page
+ * faults or because they use partial blocks.
+ */
+ if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+ ext4_aligned_io(inode, offset, count))
+ get_block_func = ext4_dio_get_block;
+ else
+ get_block_func = ext4_dax_get_block;
+ dio_flags = DIO_LOCKING;
+ } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+ get_block_func = ext4_dio_get_block;
+ dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ } else if (is_sync_kiocb(iocb)) {
get_block_func = ext4_dio_get_block_unwritten_sync;
dio_flags = DIO_LOCKING;
} else {
@@ -3399,10 +3423,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
#ifdef CONFIG_EXT4_FS_ENCRYPTION
BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
#endif
- if (IS_DAX(inode))
+ if (IS_DAX(inode)) {
ret = dax_do_io(iocb, inode, iter, get_block_func,
ext4_end_io_dio, dio_flags);
- else
+ } else
ret = __blockdev_direct_IO(iocb, inode,
inode->i_sb->s_bdev, iter,
get_block_func,
@@ -3422,12 +3446,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
- if (iov_iter_rw(iter) == WRITE)
- inode_dio_end(inode);
+ inode_dio_end(inode);
/* take i_mutex locking again if we do an overwrite dio */
if (overwrite)
inode_lock(inode);
+ if (ret < 0 && final_size > inode->i_size)
+ ext4_truncate_failed_write(inode);
+
+ /* Handle extending of i_size after direct IO write */
+ if (orphan) {
+ int err;
+
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ goto out;
+ }
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ if (ret > 0) {
+ loff_t end = offset + ret;
+ if (end > inode->i_size) {
+ ei->i_disksize = end;
+ i_size_write(inode, end);
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext4_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ err = ext4_journal_stop(handle);
+ if (ret == 0)
+ ret = err;
+ }
+out:
+ return ret;
+}
+
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int unlocked = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ ssize_t ret;
+
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ inode_dio_begin(inode);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK)))
+ inode_dio_end(inode);
+ else
+ unlocked = 1;
+ }
+ if (IS_DAX(inode)) {
+ ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
+ NULL, unlocked ? 0 : DIO_LOCKING);
+ } else {
+ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, ext4_dio_get_block,
+ NULL, NULL,
+ unlocked ? 0 : DIO_LOCKING);
+ }
+ if (unlocked)
+ inode_dio_end(inode);
return ret;
}
@@ -3455,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0;
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_direct_IO(iocb, iter);
+ if (iov_iter_rw(iter) == READ)
+ ret = ext4_direct_IO_read(iocb, iter);
else
- ret = ext4_ind_direct_IO(iocb, iter);
+ ret = ext4_direct_IO_write(iocb, iter);
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
return ret;
}
@@ -3534,10 +3632,7 @@ void ext4_set_aops(struct inode *inode)
{
switch (ext4_inode_journal_mode(inode)) {
case EXT4_INODE_ORDERED_DATA_MODE:
- ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
- break;
case EXT4_INODE_WRITEBACK_DATA_MODE:
- ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3630,8 +3725,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
} else {
err = 0;
mark_buffer_dirty(bh);
- if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
- err = ext4_jbd2_file_inode(handle, inode);
+ if (ext4_should_order_data(inode))
+ err = ext4_jbd2_inode_add_write(handle, inode);
}
unlock:
@@ -5429,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
journal_t *journal;
handle_t *handle;
int err;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
/*
* We have to be very careful here: changing a data block's
@@ -5445,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
- /* We have to allocate physical blocks for delalloc blocks
- * before flushing journal. otherwise delalloc blocks can not
- * be allocated any more. even more truncate on delalloc blocks
- * could trigger BUG by flushing delalloc blocks in journal.
- * There is no delalloc block in non-journal data mode.
- */
- if (val && test_opt(inode->i_sb, DELALLOC)) {
- err = ext4_alloc_da_blocks(inode);
- if (err < 0)
- return err;
- }
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Before flushing the journal and switching inode's aops, we have
+ * to flush all dirty data the inode has. There can be outstanding
+ * delayed allocations, there can be unwritten extents created by
+ * fallocate or buffered writes in dioread_nolock mode covered by
+ * dirty data which can be converted only after flushing the dirty
+ * data (and journalled aops don't know how to handle these cases).
+ */
+ if (val) {
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err < 0) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ ext4_inode_resume_unlocked_dio(inode);
+ return err;
+ }
+ }
+
+ percpu_down_write(&sbi->s_journal_flag_rwsem);
jbd2_journal_lock_updates(journal);
/*
@@ -5477,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
+ percpu_up_write(&sbi->s_journal_flag_rwsem);
ext4_inode_resume_unlocked_dio(inode);
return err;
}
@@ -5485,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ percpu_up_write(&sbi->s_journal_flag_rwsem);
+
+ if (val)
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eae5917c534e..28cc412852af 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -13,8 +13,8 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
-#include <linux/random.h>
#include <linux/quotaops.h>
+#include <linux/uuid.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -365,7 +365,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
struct dquot *transfer_to[MAXQUOTAS] = { };
transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
- if (transfer_to[PRJQUOTA]) {
+ if (!IS_ERR(transfer_to[PRJQUOTA])) {
err = __dquot_transfer(inode, transfer_to);
dqput(transfer_to[PRJQUOTA]);
if (err)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index eeeade76012e..c1ab3ec30423 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1266,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
int order = 1;
+ int bb_incr = 1 << (e4b->bd_blkbits - 1);
void *bb;
BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
@@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
/* this block is part of buddy of order 'order' */
return order;
}
- bb += 1 << (e4b->bd_blkbits - order);
+ bb += bb_incr;
+ bb_incr >>= 1;
order++;
}
return 0;
@@ -2583,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
- unsigned offset;
+ unsigned offset, offset_incr;
unsigned max;
int ret;
@@ -2612,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb)
i = 1;
offset = 0;
+ offset_incr = 1 << (sb->s_blocksize_bits - 1);
max = sb->s_blocksize << 2;
do {
sbi->s_mb_offsets[i] = offset;
sbi->s_mb_maxs[i] = max;
- offset += 1 << (sb->s_blocksize_bits - i);
+ offset += offset_incr;
+ offset_incr = offset_incr >> 1;
max = max >> 1;
i++;
} while (i <= sb->s_blocksize_bits + 1);
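
The mballoc changes replace "1 << (blocksize_bits - i)" with an increment that starts at half a block and halves every round; once i exceeds blocksize_bits the old expression shifted by a negative count, which is undefined behaviour in C. A quick userspace loop showing the fixed progression (4 KiB block size assumed):

#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;              /* 4096-byte blocks (assumed) */
	unsigned int offset = 0;
	unsigned int incr = 1U << (blocksize_bits - 1);
	unsigned int i;

	/* Each order's bitmap needs half as many bits as the previous one,
	 * so the running offset advances by a halving increment. */
	for (i = 1; i <= blocksize_bits + 1; i++) {
		printf("order %2u: offset %u, next increment %u\n", i, offset, incr);
		offset += incr;
		incr >>= 1;
	}
	return 0;
}
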
@@ -4935,7 +4939,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- ext4_warning(sb, "too much blocks added to group %u\n",
+ ext4_warning(sb, "too much blocks added to group %u",
block_group);
err = -EINVAL;
goto error_return;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 24445275d330..23d436d6f8b8 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
__ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
"MMP failure info: last update time: %llu, last update "
- "node: %s, last update device: %s\n",
+ "node: %s, last update device: %s",
(long long unsigned int) le64_to_cpu(mmp->mmp_time),
mmp->mmp_nodename, mmp->mmp_bdevname);
}
@@ -353,7 +353,7 @@ skip:
* wait for MMP interval and check mmp_seq.
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
- ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ ext4_warning(sb, "MMP startup interrupted, failing mount");
goto failed;
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 325cef48b39a..a920c5d29fac 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -400,7 +400,7 @@ data_copy:
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
- *err = ext4_jbd2_file_inode(handle, orig_inode);
+ *err = ext4_jbd2_inode_add_write(handle, orig_inode);
unlock_pages:
unlock_page(pagep[0]);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5611ec9348d7..ec4c39952e84 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1107,6 +1107,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
while (1) {
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto errout;
+ }
+ cond_resched();
block = dx_get_block(frame->at);
ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
start_hash, start_minor_hash);
@@ -1613,7 +1618,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
if (nokey)
return ERR_PTR(-ENOKEY);
ext4_warning(inode->i_sb,
- "Inconsistent encryption contexts: %lu/%lu\n",
+ "Inconsistent encryption contexts: %lu/%lu",
(unsigned long) dir->i_ino,
(unsigned long) inode->i_ino);
return ERR_PTR(-EPERM);
@@ -2828,7 +2833,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
* list entries can cause panics at unmount time.
*/
mutex_lock(&sbi->s_orphan_lock);
- list_del(&EXT4_I(inode)->i_orphan);
+ list_del_init(&EXT4_I(inode)->i_orphan);
mutex_unlock(&sbi->s_orphan_lock);
}
}
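
The ext4_orphan_add() error path above switches from list_del() to list_del_init(). The kernel's list_del() leaves poisoned pointers behind, so a later emptiness check or second removal of the same entry is unsafe, while list_del_init() leaves the node as a valid empty list. The userspace sketch below mirrors that behaviour with a minimal stand-in for <linux/list.h>; the struct and helpers are not the real API.

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	INIT_LIST_HEAD(e);	/* node is now a valid, empty list again */
}

static bool list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
	struct list_head orphans, inode;

	INIT_LIST_HEAD(&orphans);
	list_add(&inode, &orphans);
	list_del_init(&inode);		/* error path unlinks the inode */
	printf("still linked? %s\n", list_empty(&inode) ? "no" : "yes");
	list_del_init(&inode);		/* harmless if reached again */
	return 0;
}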
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index e4fc8ea45d78..2a01df9cc1c3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -342,9 +342,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
WRITE_SYNC : WRITE;
- bio_get(io->io_bio);
submit_bio(io_op, io->io_bio);
- bio_put(io->io_bio);
}
io->io_bio = NULL;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 34038e3598d5..cf681004b196 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb)
*/
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
ext4_warning(sb, "There are errors in the filesystem, "
- "so online resizing is not allowed\n");
+ "so online resizing is not allowed");
return -EPERM;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712dbe12..3822a5aedc61 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3416,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
- if (blocksize != PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "error: unsupported blocksize for dax");
- goto failed_mount;
- }
- if (!sb->s_bdev->bd_disk->fops->direct_access) {
- ext4_msg(sb, KERN_ERR,
- "error: device does not support dax");
+ err = bdev_dax_supported(sb, blocksize);
+ if (err)
goto failed_mount;
- }
}
if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
@@ -3930,6 +3924,9 @@ no_journal:
if (!err)
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
+ if (!err)
+ err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount6;
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 123a7d010efe..a8921112030d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
static int
ext4_xattr_security_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 60652fa24cbc..c7765c735714 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
static int
ext4_xattr_trusted_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 17a446ffecd3..ca20e423034b 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
static int
ext4_xattr_user_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- if (!test_opt(dentry->d_sb, XATTR_USER))
+ if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 1f8982a957f1..378c221d68a9 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -94,3 +94,11 @@ config F2FS_IO_TRACE
information and block IO patterns in the filesystem level.
If unsure, say N.
+
+config F2FS_FAULT_INJECTION
+ bool "F2FS fault injection facility"
+ depends on F2FS_FS
+ help
+ Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on.
+
+ If unsure, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 6f1fdda977b3..a31c7e859af6 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -115,7 +115,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
struct f2fs_acl_entry *entry;
int i;
- f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
+ f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
sizeof(struct f2fs_acl_entry), GFP_NOFS);
if (!f2fs_acl)
return ERR_PTR(-ENOMEM);
@@ -175,7 +175,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
if (retval > 0) {
- value = kmalloc(retval, GFP_F2FS_ZERO);
+ value = f2fs_kmalloc(retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
retval = f2fs_getxattr(inode, name_index, "", value,
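
The f2fs_kmalloc() conversions above route allocations through a wrapper that can fail them on purpose so ENOMEM error paths get exercised. A minimal userspace sketch of that idea follows; the xmalloc name and the rate of 8 are assumptions for the example, whereas the kernel side is driven by the new fault-injection Kconfig/mount machinery rather than a hard-coded constant.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint inject_ops;
static const unsigned int inject_rate = 8;	/* fail roughly 1 in 8 calls */

static void *xmalloc(size_t size)
{
	if (inject_rate &&
	    atomic_fetch_add(&inject_ops, 1) + 1 >= inject_rate) {
		atomic_store(&inject_ops, 0);
		fprintf(stderr, "injecting allocation failure (%zu bytes)\n",
			size);
		return NULL;			/* simulated ENOMEM */
	}
	return malloc(size);
}

int main(void)
{
	int failures = 0;

	for (int i = 0; i < 100; i++) {
		void *p = xmalloc(64);
		if (!p) {
			failures++;		/* caller must handle ENOMEM */
			continue;
		}
		free(p);
	}
	printf("%d of 100 allocations were failed on purpose\n", failures);
	return 0;
}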
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0955312e5ca0..389160049993 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -26,6 +26,14 @@
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *inode_entry_slab;
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
+{
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ sbi->sb->s_flags |= MS_RDONLY;
+ if (!end_io)
+ f2fs_flush_merged_bios(sbi);
+}
+
/*
* We guarantee no failure on the returned page.
*/
@@ -34,7 +42,7 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
cond_resched();
goto repeat;
@@ -64,7 +72,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
if (unlikely(!is_meta))
fio.rw &= ~REQ_META;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
cond_resched();
goto repeat;
@@ -91,7 +99,7 @@ repeat:
* meta page.
*/
if (unlikely(!PageUptodate(page)))
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
out:
return page;
}
@@ -186,7 +194,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
BUG();
}
- page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr);
+ page = f2fs_grab_cache_page(META_MAPPING(sbi),
+ fio.new_blkaddr, false);
if (!page)
continue;
if (PageUptodate(page)) {
@@ -211,7 +220,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
bool readahead = false;
page = find_get_page(META_MAPPING(sbi), index);
- if (!page || (page && !PageUptodate(page)))
+ if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
@@ -448,12 +457,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-void release_ino_entry(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
{
struct ino_entry *e, *tmp;
int i;
- for (i = APPEND_INO; i <= UPDATE_INO; i++) {
+ for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) {
struct inode_management *im = &sbi->im[i];
spin_lock(&im->ino_lock);
@@ -473,6 +482,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
int err = 0;
spin_lock(&im->ino_lock);
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_ORPHAN)) {
+ spin_unlock(&im->ino_lock);
+ return -ENOSPC;
+ }
+#endif
if (unlikely(im->ino_num >= sbi->max_orphans))
err = -ENOSPC;
else
@@ -777,43 +793,32 @@ void update_dirty_page(struct inode *inode, struct page *page)
!S_ISLNK(inode->i_mode))
return;
- spin_lock(&sbi->inode_lock[type]);
- __add_dirty_inode(inode, type);
- inode_inc_dirty_pages(inode);
- spin_unlock(&sbi->inode_lock[type]);
+ if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) {
+ spin_lock(&sbi->inode_lock[type]);
+ __add_dirty_inode(inode, type);
+ spin_unlock(&sbi->inode_lock[type]);
+ }
+ inode_inc_dirty_pages(inode);
SetPagePrivate(page);
f2fs_trace_pid(page);
}
-void add_dirty_dir_inode(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- spin_lock(&sbi->inode_lock[DIR_INODE]);
- __add_dirty_inode(inode, DIR_INODE);
- spin_unlock(&sbi->inode_lock[DIR_INODE]);
-}
-
void remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
!S_ISLNK(inode->i_mode))
return;
+ if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH))
+ return;
+
spin_lock(&sbi->inode_lock[type]);
__remove_dirty_inode(inode, type);
spin_unlock(&sbi->inode_lock[type]);
-
- /* Only from the recovery routine */
- if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
- clear_inode_flag(fi, FI_DELAY_IPUT);
- iput(inode);
- }
}
int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
@@ -892,7 +897,7 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- err = sync_node_pages(sbi, 0, &wbc);
+ err = sync_node_pages(sbi, &wbc);
if (err) {
f2fs_unlock_all(sbi);
goto out;
@@ -917,7 +922,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WRITEBACK))
+ if (!atomic_read(&sbi->nr_wb_bios))
break;
io_schedule_timeout(5*HZ);
@@ -1082,7 +1087,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
- sbi->alloc_valid_block_count = 0;
+ percpu_counter_set(&sbi->alloc_valid_block_count, 0);
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
@@ -1098,7 +1103,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
discard_blk);
- release_ino_entry(sbi);
+ release_ino_entry(sbi, false);
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
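
The checkpoint and data.c hunks replace per-page F2FS_WRITEBACK accounting with a single in-flight bio counter: the submit path increments nr_wb_bios, the completion path decrements it and wakes the checkpoint waiter only when it reaches zero. The sketch below models that with threads, an atomic counter and a condition variable; thread counts and delays are arbitrary and the names only echo the kernel ones.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int nr_wb_bios;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cp_wait = PTHREAD_COND_INITIALIZER;

static void *bio_end_io(void *arg)
{
	usleep(1000 * (1 + (long)arg % 5));	/* pretend the I/O took a while */
	if (atomic_fetch_sub(&nr_wb_bios, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&cp_wait);	/* last bio just completed */
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t io[8];

	for (long i = 0; i < 8; i++) {
		atomic_fetch_add(&nr_wb_bios, 1);	/* submit side */
		pthread_create(&io[i], NULL, bio_end_io, (void *)i);
	}

	pthread_mutex_lock(&lock);
	while (atomic_load(&nr_wb_bios))	/* checkpoint waits for zero */
		pthread_cond_wait(&cp_wait, &lock);
	pthread_mutex_unlock(&lock);
	printf("all writeback bios completed\n");

	for (int i = 0; i < 8; i++)
		pthread_join(io[i], NULL);
	return 0;
}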
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index bb376c3bca62..9a8bbc1fb1fa 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,13 +68,12 @@ static void f2fs_write_end_io(struct bio *bio)
if (unlikely(bio->bi_error)) {
set_bit(AS_EIO, &page->mapping->flags);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, true);
}
end_page_writeback(page);
- dec_page_count(sbi, F2FS_WRITEBACK);
}
-
- if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait))
+ if (atomic_dec_and_test(&sbi->nr_wb_bios) &&
+ wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
bio_put(bio);
@@ -98,6 +97,14 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
return bio;
}
+static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw,
+ struct bio *bio)
+{
+ if (!is_read_io(rw))
+ atomic_inc(&sbi->nr_wb_bios);
+ submit_bio(rw, bio);
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -110,7 +117,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
else
trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
- submit_bio(fio->rw, io->bio);
+ __submit_bio(io->sbi, fio->rw, io->bio);
io->bio = NULL;
}
@@ -228,7 +235,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
return -EFAULT;
}
- submit_bio(fio->rw, bio);
+ __submit_bio(fio->sbi, fio->rw, bio);
return 0;
}
@@ -248,9 +255,6 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
down_write(&io->io_rwsem);
- if (!is_read)
- inc_page_count(sbi, F2FS_WRITEBACK);
-
if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
io->fio.rw != fio->rw))
__submit_merged_bio(io);
@@ -278,6 +282,16 @@ alloc_new:
trace_f2fs_submit_page_mbio(fio->page, fio);
}
+static void __set_data_blkaddr(struct dnode_of_data *dn)
+{
+ struct f2fs_node *rn = F2FS_NODE(dn->node_page);
+ __le32 *addr_array;
+
+ /* Get physical address of data block */
+ addr_array = blkaddr_in_node(rn);
+ addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+}
+
/*
* Lock ordering for the change of data block address:
* ->data_page
@@ -286,19 +300,9 @@ alloc_new:
*/
void set_data_blkaddr(struct dnode_of_data *dn)
{
- struct f2fs_node *rn;
- __le32 *addr_array;
- struct page *node_page = dn->node_page;
- unsigned int ofs_in_node = dn->ofs_in_node;
-
- f2fs_wait_on_page_writeback(node_page, NODE, true);
-
- rn = F2FS_NODE(node_page);
-
- /* Get physical address of data block */
- addr_array = blkaddr_in_node(rn);
- addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
- if (set_page_dirty(node_page))
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ __set_data_blkaddr(dn);
+ if (set_page_dirty(dn->node_page))
dn->node_changed = true;
}
@@ -309,24 +313,53 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
f2fs_update_extent_cache(dn);
}
-int reserve_new_block(struct dnode_of_data *dn)
+/* dn->ofs_in_node will be returned with up-to-date last block pointer */
+int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ if (!count)
+ return 0;
+
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
return -ENOSPC;
- trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+ trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
+ dn->ofs_in_node, count);
+
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+
+ for (; count > 0; dn->ofs_in_node++) {
+ block_t blkaddr =
+ datablock_addr(dn->node_page, dn->ofs_in_node);
+ if (blkaddr == NULL_ADDR) {
+ dn->data_blkaddr = NEW_ADDR;
+ __set_data_blkaddr(dn);
+ count--;
+ }
+ }
+
+ if (set_page_dirty(dn->node_page))
+ dn->node_changed = true;
- dn->data_blkaddr = NEW_ADDR;
- set_data_blkaddr(dn);
mark_inode_dirty(dn->inode);
sync_inode_page(dn);
return 0;
}
+/* Should keep dn->ofs_in_node unchanged */
+int reserve_new_block(struct dnode_of_data *dn)
+{
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ int ret;
+
+ ret = reserve_new_blocks(dn, 1);
+ dn->ofs_in_node = ofs_in_node;
+ return ret;
+}
+
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
{
bool need_put = dn->inode_page ? false : true;
@@ -545,6 +578,7 @@ static int __allocate_data_block(struct dnode_of_data *dn)
struct node_info ni;
int seg = CURSEG_WARM_DATA;
pgoff_t fofs;
+ blkcnt_t count = 1;
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
@@ -553,7 +587,7 @@ static int __allocate_data_block(struct dnode_of_data *dn)
if (dn->data_blkaddr == NEW_ADDR)
goto alloc;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
return -ENOSPC;
alloc:
@@ -582,8 +616,8 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
struct f2fs_map_blocks map;
ssize_t ret = 0;
- map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos);
- map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from));
+ map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
+ map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from));
map.m_next_pgofs = NULL;
if (f2fs_encrypted_inode(inode))
@@ -621,8 +655,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
- pgoff_t pgofs, end_offset;
+ pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
+ unsigned int ofs_in_node, last_ofs_in_node;
+ blkcnt_t prealloc;
struct extent_info ei;
bool allocated = false;
block_t blkaddr;
@@ -632,6 +668,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
/* it only supports block size == page size */
pgofs = (pgoff_t)map->m_lblk;
+ end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
map->m_pblk = ei.blk + pgofs - ei.fofs;
@@ -648,6 +685,8 @@ next_dnode:
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
+ if (flag == F2FS_GET_BLOCK_BMAP)
+ map->m_pblk = 0;
if (err == -ENOENT) {
err = 0;
if (map->m_next_pgofs)
@@ -657,6 +696,8 @@ next_dnode:
goto unlock_out;
}
+ prealloc = 0;
+ ofs_in_node = dn.ofs_in_node;
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
@@ -669,31 +710,41 @@ next_block:
goto sync_out;
}
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
- if (blkaddr == NULL_ADDR)
- err = reserve_new_block(&dn);
+ if (blkaddr == NULL_ADDR) {
+ prealloc++;
+ last_ofs_in_node = dn.ofs_in_node;
+ }
} else {
err = __allocate_data_block(&dn);
+ if (!err) {
+ set_inode_flag(F2FS_I(inode),
+ FI_APPEND_WRITE);
+ allocated = true;
+ }
}
if (err)
goto sync_out;
- allocated = true;
map->m_flags = F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
} else {
+ if (flag == F2FS_GET_BLOCK_BMAP) {
+ map->m_pblk = 0;
+ goto sync_out;
+ }
if (flag == F2FS_GET_BLOCK_FIEMAP &&
blkaddr == NULL_ADDR) {
if (map->m_next_pgofs)
*map->m_next_pgofs = pgofs + 1;
}
if (flag != F2FS_GET_BLOCK_FIEMAP ||
- blkaddr != NEW_ADDR) {
- if (flag == F2FS_GET_BLOCK_BMAP)
- err = -ENOENT;
+ blkaddr != NEW_ADDR)
goto sync_out;
- }
}
}
+ if (flag == F2FS_GET_BLOCK_PRE_AIO)
+ goto skip;
+
if (map->m_len == 0) {
/* preallocated unwritten block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
@@ -705,32 +756,49 @@ next_block:
} else if ((map->m_pblk != NEW_ADDR &&
blkaddr == (map->m_pblk + ofs)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
- flag == F2FS_GET_BLOCK_PRE_DIO ||
- flag == F2FS_GET_BLOCK_PRE_AIO) {
+ flag == F2FS_GET_BLOCK_PRE_DIO) {
ofs++;
map->m_len++;
} else {
goto sync_out;
}
+skip:
dn.ofs_in_node++;
pgofs++;
- if (map->m_len < maxblocks) {
- if (dn.ofs_in_node < end_offset)
- goto next_block;
+ /* preallocate blocks in batch for one dnode page */
+ if (flag == F2FS_GET_BLOCK_PRE_AIO &&
+ (pgofs == end || dn.ofs_in_node == end_offset)) {
- if (allocated)
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
+ dn.ofs_in_node = ofs_in_node;
+ err = reserve_new_blocks(&dn, prealloc);
+ if (err)
+ goto sync_out;
- if (create) {
- f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ map->m_len += dn.ofs_in_node - ofs_in_node;
+ if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
+ err = -ENOSPC;
+ goto sync_out;
}
- allocated = false;
- goto next_dnode;
+ dn.ofs_in_node = end_offset;
+ }
+
+ if (pgofs >= end)
+ goto sync_out;
+ else if (dn.ofs_in_node < end_offset)
+ goto next_block;
+
+ if (allocated)
+ sync_inode_page(&dn);
+ f2fs_put_dnode(&dn);
+
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, allocated);
}
+ allocated = false;
+ goto next_dnode;
sync_out:
if (allocated)
@@ -983,7 +1051,7 @@ got_it:
*/
if (bio && (last_block_in_bio != block_nr - 1)) {
submit_and_realloc:
- submit_bio(READ, bio);
+ __submit_bio(F2FS_I_SB(inode), READ, bio);
bio = NULL;
}
if (bio == NULL) {
@@ -1026,7 +1094,7 @@ set_error_page:
goto next_page;
confused:
if (bio) {
- submit_bio(READ, bio);
+ __submit_bio(F2FS_I_SB(inode), READ, bio);
bio = NULL;
}
unlock_page(page);
@@ -1036,7 +1104,7 @@ next_page:
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- submit_bio(READ, bio);
+ __submit_bio(F2FS_I_SB(inode), READ, bio);
return 0;
}
@@ -1177,8 +1245,10 @@ write:
goto redirty_out;
if (f2fs_is_drop_cache(inode))
goto out;
- if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
- available_free_memory(sbi, BASE_CHECK))
+ /* we should not write the 0'th page, which holds the journal header */

+ if (f2fs_is_volatile_file(inode) && (!page->index ||
+ (!wbc->for_reclaim &&
+ available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
/* Dentry blocks are controlled by checkpoint */
@@ -1480,7 +1550,8 @@ restart:
if (pos + len <= MAX_INLINE_DATA) {
read_inline_data(page, ipage);
set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- set_inline_node(ipage);
+ if (inode->i_nlink)
+ set_inline_node(ipage);
} else {
err = f2fs_convert_inline_page(&dn, page);
if (err)
@@ -1496,7 +1567,7 @@ restart:
} else {
/* hole case */
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err || (!err && dn.data_blkaddr == NULL_ADDR)) {
+ if (err || dn.data_blkaddr == NULL_ADDR) {
f2fs_put_dnode(&dn);
f2fs_lock_op(sbi);
locked = true;
@@ -1683,8 +1754,12 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
- if (err < 0 && iov_iter_rw(iter) == WRITE)
- f2fs_write_failed(mapping, offset + count);
+ if (iov_iter_rw(iter) == WRITE) {
+ if (err > 0)
+ set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+ else if (err < 0)
+ f2fs_write_failed(mapping, offset + count);
+ }
trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
@@ -1714,6 +1789,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
if (IS_ATOMIC_WRITTEN_PAGE(page))
return;
+ set_page_private(page, 0);
ClearPagePrivate(page);
}
@@ -1727,6 +1803,7 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (IS_ATOMIC_WRITTEN_PAGE(page))
return 0;
+ set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index f4a61a5ff79f..d89a425055d0 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -48,7 +48,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
- si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
+ si->wb_bios = atomic_read(&sbi->nr_wb_bios);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -58,6 +58,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_xattr = atomic_read(&sbi->inline_xattr);
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
+ si->orphans = sbi->im[ORPHAN_INO].ino_num;
si->utilization = utilization(sbi);
si->free_segs = free_segments(sbi);
@@ -143,6 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
si->base_mem += 2 * sizeof(struct f2fs_inode_info);
si->base_mem += sizeof(*sbi->ckpt);
+ si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
/* build sm */
si->base_mem += sizeof(struct f2fs_sm_info);
@@ -192,7 +194,7 @@ get_cache:
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
- for (i = 0; i <= UPDATE_INO; i++)
+ for (i = 0; i <= ORPHAN_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
sizeof(struct extent_tree);
@@ -216,8 +218,9 @@ static int stat_show(struct seq_file *s, void *v)
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n",
- si->sbi->sb->s_bdev, i++);
+ seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n",
+ si->sbi->sb->s_bdev, i++,
+ f2fs_readonly(si->sbi->sb) ? "RO": "RW");
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -237,6 +240,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
si->inline_dir);
+ seq_printf(s, " - Orphan Inode: %u\n",
+ si->orphans);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
@@ -295,15 +300,15 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - inmem: %4d, wb: %4d\n",
- si->inmem_pages, si->wb_pages);
- seq_printf(s, " - nodes: %4d in %4d\n",
+ seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n",
+ si->inmem_pages, si->wb_bios);
+ seq_printf(s, " - nodes: %4lld in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents: %4d in dirs:%4d\n",
+ seq_printf(s, " - dents: %4lld in dirs:%4d\n",
si->ndirty_dent, si->ndirty_dirs);
- seq_printf(s, " - datas: %4d in files:%4d\n",
+ seq_printf(s, " - datas: %4lld in files:%4d\n",
si->ndirty_data, si->ndirty_files);
- seq_printf(s, " - meta: %4d in %4d\n",
+ seq_printf(s, " - meta: %4lld in %4d\n",
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 9e4615146d13..f9313f684540 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
[F2FS_FT_SYMLINK] = DT_LNK,
};
-#define S_SHIFT 12
static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = F2FS_FT_DIR,
@@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
+unsigned char get_de_type(struct f2fs_dir_entry *de)
+{
+ if (de->file_type < F2FS_FT_MAX)
+ return f2fs_filetype_table[de->file_type];
+ return DT_UNKNOWN;
+}
+
static unsigned long dir_block_index(unsigned int level,
int dir_level, unsigned int idx)
{
@@ -95,11 +101,6 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
else
kunmap(dentry_page);
- /*
- * For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs has occurred.
- */
- f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0);
return de;
}
@@ -124,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
de = &d->dentry[bit_pos];
+ if (unlikely(!de->name_len)) {
+ bit_pos++;
+ continue;
+ }
+
/* encrypted case */
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -141,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
*max_slots = max_len;
max_len = 0;
- /* remain bug on condition */
- if (unlikely(!de->name_len))
- d->max = -1;
-
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
@@ -389,9 +391,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
return page;
if (S_ISDIR(inode->i_mode)) {
+ /* in order to handle error case */
+ get_page(page);
err = make_empty_dir(inode, dir, page);
- if (err)
- goto error;
+ if (err) {
+ lock_page(page);
+ goto put_error;
+ }
+ put_page(page);
}
err = f2fs_init_acl(inode, dir, page, dpage);
@@ -435,13 +442,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
return page;
put_error:
- f2fs_put_page(page, 1);
-error:
- /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+ /* truncate empty dir pages */
truncate_inode_pages(&inode->i_data, 0);
- truncate_blocks(inode, 0, false);
- remove_dirty_inode(inode);
- remove_inode_page(inode);
+
+ clear_nlink(inode);
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
}
@@ -509,11 +515,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
}
}
-/*
- * Caller should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op().
- */
-int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
struct inode *inode, nid_t ino, umode_t mode)
{
unsigned int bit_pos;
@@ -526,28 +528,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dentry_ptr d;
struct page *page = NULL;
- struct fscrypt_name fname;
- struct qstr new_name;
- int slots, err;
-
- err = fscrypt_setup_filename(dir, name, 0, &fname);
- if (err)
- return err;
-
- new_name.name = fname_name(&fname);
- new_name.len = fname_len(&fname);
-
- if (f2fs_has_inline_dentry(dir)) {
- err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
- if (!err || err != -EAGAIN)
- goto out;
- else
- err = 0;
- }
+ int slots, err = 0;
level = 0;
- slots = GET_DENTRY_SLOTS(new_name.len);
- dentry_hash = f2fs_dentry_hash(&new_name);
+ slots = GET_DENTRY_SLOTS(new_name->len);
+ dentry_hash = f2fs_dentry_hash(new_name);
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
@@ -556,10 +541,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
}
start:
- if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) {
- err = -ENOSPC;
- goto out;
- }
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_DIR_DEPTH))
+ return -ENOSPC;
+#endif
+ if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
+ return -ENOSPC;
/* Increase the depth, if required */
if (level == current_depth)
@@ -573,10 +560,8 @@ start:
for (block = bidx; block <= (bidx + nblock - 1); block++) {
dentry_page = get_new_data_page(dir, NULL, block, true);
- if (IS_ERR(dentry_page)) {
- err = PTR_ERR(dentry_page);
- goto out;
- }
+ if (IS_ERR(dentry_page))
+ return PTR_ERR(dentry_page);
dentry_blk = kmap(dentry_page);
bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
@@ -596,7 +581,7 @@ add_dentry:
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, &new_name, NULL);
+ page = init_inode_metadata(inode, dir, new_name, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -606,7 +591,7 @@ add_dentry:
}
make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
- f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos);
+ f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
set_page_dirty(dentry_page);
@@ -628,7 +613,34 @@ fail:
}
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
-out:
+
+ return err;
+}
+
+/*
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+ struct inode *inode, nid_t ino, umode_t mode)
+{
+ struct fscrypt_name fname;
+ struct qstr new_name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, name, 0, &fname);
+ if (err)
+ return err;
+
+ new_name.name = fname_name(&fname);
+ new_name.len = fname_len(&fname);
+
+ err = -EAGAIN;
+ if (f2fs_has_inline_dentry(dir))
+ err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
+ if (err == -EAGAIN)
+ err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode);
+
fscrypt_free_filename(&fname);
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
@@ -792,10 +804,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
continue;
}
- if (de->file_type < F2FS_FT_MAX)
- d_type = f2fs_filetype_table[de->file_type];
- else
- d_type = DT_UNKNOWN;
+ d_type = get_de_type(de);
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -804,7 +813,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
int save_len = fstr->len;
int ret;
- de_name.name = kmalloc(de_name.len, GFP_NOFS);
+ de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS);
if (!de_name.name)
return false;
@@ -887,6 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
+ err = 0;
out:
fscrypt_fname_free_buffer(&fstr);
return err;
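
The reshuffled __f2fs_add_link() above seeds the error with -EAGAIN, lets the inline-dentry path either consume the entry or leave -EAGAIN untouched, and falls back to the regular dentry block only in that case. The sketch below isolates that control-flow pattern; try_inline(), try_regular() and the capacity of 2 are illustrative stand-ins, not f2fs functions.

#include <errno.h>
#include <stdio.h>

static int inline_entries;

static int try_inline(const char *name)
{
	if (inline_entries >= 2)
		return -EAGAIN;		/* inline area full: punt to fallback */
	inline_entries++;
	printf("%s stored inline\n", name);
	return 0;
}

static int try_regular(const char *name)
{
	printf("%s stored in a regular dentry block\n", name);
	return 0;
}

static int add_link(const char *name, int has_inline_dir)
{
	int err = -EAGAIN;

	if (has_inline_dir)
		err = try_inline(name);
	if (err == -EAGAIN)
		err = try_regular(name);
	return err;
}

int main(void)
{
	const char *names[] = { "a", "b", "c" };

	for (int i = 0; i < 3; i++)
		add_link(names[i], 1);
	return 0;
}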
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index c859bb044728..5bfcdb9b69f2 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -196,8 +196,7 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
if (!i_ext || !i_ext->len)
return false;
- set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
- le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+ get_extent_info(&ei, i_ext);
write_lock(&et->lock);
if (atomic_read(&et->node_cnt))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7a4558d17f36..916e7c238e3d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,6 +37,57 @@
} while (0)
#endif
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+enum {
+ FAULT_KMALLOC,
+ FAULT_PAGE_ALLOC,
+ FAULT_ALLOC_NID,
+ FAULT_ORPHAN,
+ FAULT_BLOCK,
+ FAULT_DIR_DEPTH,
+ FAULT_MAX,
+};
+
+struct f2fs_fault_info {
+ atomic_t inject_ops;
+ unsigned int inject_rate;
+ unsigned int inject_type;
+};
+
+extern struct f2fs_fault_info f2fs_fault;
+extern char *fault_name[FAULT_MAX];
+#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type)))
+
+static inline bool time_to_inject(int type)
+{
+ if (!f2fs_fault.inject_rate)
+ return false;
+ if (type == FAULT_KMALLOC && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_BLOCK && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type))
+ return false;
+
+ atomic_inc(&f2fs_fault.inject_ops);
+ if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) {
+ atomic_set(&f2fs_fault.inject_ops, 0);
+ printk("%sF2FS-fs : inject %s in %pF\n",
+ KERN_INFO,
+ fault_name[type],
+ __builtin_return_address(0));
+ return true;
+ }
+ return false;
+}
+#endif
+
/*
* For mount options
*/
@@ -56,6 +107,7 @@
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
+#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -159,7 +211,6 @@ struct fsync_inode_entry {
struct inode *inode; /* vfs inode pointer */
block_t blkaddr; /* block address locating the last fsync */
block_t last_dentry; /* block address locating the last dentry */
- block_t last_inode; /* block address locating the last inode */
};
#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
@@ -385,7 +436,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- atomic_t dirty_pages; /* # of dirty pages */
+ struct percpu_counter dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
@@ -398,11 +449,11 @@ struct f2fs_inode_info {
};
static inline void get_extent_info(struct extent_info *ext,
- struct f2fs_extent i_ext)
+ struct f2fs_extent *i_ext)
{
- ext->fofs = le32_to_cpu(i_ext.fofs);
- ext->blk = le32_to_cpu(i_ext.blk);
- ext->len = le32_to_cpu(i_ext.len);
+ ext->fofs = le32_to_cpu(i_ext->fofs);
+ ext->blk = le32_to_cpu(i_ext->blk);
+ ext->len = le32_to_cpu(i_ext->len);
}
static inline void set_raw_extent(struct extent_info *ext,
@@ -599,7 +650,6 @@ struct f2fs_sm_info {
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
enum count_type {
- F2FS_WRITEBACK,
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
F2FS_DIRTY_NODES,
@@ -672,6 +722,7 @@ enum {
SBI_IS_CLOSE, /* specify unmounting */
SBI_NEED_FSCK, /* need fsck.f2fs to fix */
SBI_POR_DOING, /* recovery is doing or not */
+ SBI_NEED_SB_WRITE, /* need to recover superblock */
};
enum {
@@ -680,6 +731,10 @@ enum {
MAX_TIME,
};
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#define F2FS_KEY_DESC_PREFIX "f2fs:"
+#define F2FS_KEY_DESC_PREFIX_SIZE 5
+#endif
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
@@ -687,6 +742,10 @@ struct f2fs_sb_info {
int valid_super_block; /* valid super block no */
int s_flag; /* flags for sbi */
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
+ u8 key_prefix_size;
+#endif
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
struct inode *node_inode; /* cache node blocks */
@@ -742,18 +801,24 @@ struct f2fs_sb_info {
unsigned int total_sections; /* total section count */
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
- unsigned int total_valid_inode_count; /* valid inode count */
loff_t max_file_blocks; /* max block index of file */
int active_logs; /* # of active logs */
int dir_level; /* directory level */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
- block_t alloc_valid_block_count; /* # of allocated blocks */
block_t discard_blks; /* discard command candidats */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
- atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
+ atomic_t nr_wb_bios; /* # of writeback bios */
+
+ /* # of pages, see count_type */
+ struct percpu_counter nr_pages[NR_COUNT_TYPE];
+ /* # of allocated blocks */
+ struct percpu_counter alloc_valid_block_count;
+
+ /* valid inode count */
+ struct percpu_counter total_valid_inode_count;
struct f2fs_mount_info mount_opt; /* mount options */
@@ -1055,21 +1120,33 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
}
static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
- struct inode *inode, blkcnt_t count)
+ struct inode *inode, blkcnt_t *count)
{
block_t valid_block_count;
spin_lock(&sbi->stat_lock);
- valid_block_count =
- sbi->total_valid_block_count + (block_t)count;
- if (unlikely(valid_block_count > sbi->user_block_count)) {
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_BLOCK)) {
spin_unlock(&sbi->stat_lock);
return false;
}
- inode->i_blocks += count;
- sbi->total_valid_block_count = valid_block_count;
- sbi->alloc_valid_block_count += (block_t)count;
+#endif
+ valid_block_count =
+ sbi->total_valid_block_count + (block_t)(*count);
+ if (unlikely(valid_block_count > sbi->user_block_count)) {
+ *count = sbi->user_block_count - sbi->total_valid_block_count;
+ if (!*count) {
+ spin_unlock(&sbi->stat_lock);
+ return false;
+ }
+ }
+ /* *count can be recalculated */
+ inode->i_blocks += *count;
+ sbi->total_valid_block_count =
+ sbi->total_valid_block_count + (block_t)(*count);
spin_unlock(&sbi->stat_lock);
+
+ percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
return true;
}
@@ -1087,20 +1164,20 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- atomic_inc(&sbi->nr_pages[count_type]);
+ percpu_counter_inc(&sbi->nr_pages[count_type]);
set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
{
- atomic_inc(&F2FS_I(inode)->dirty_pages);
+ percpu_counter_inc(&F2FS_I(inode)->dirty_pages);
inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- atomic_dec(&sbi->nr_pages[count_type]);
+ percpu_counter_dec(&sbi->nr_pages[count_type]);
}
static inline void inode_dec_dirty_pages(struct inode *inode)
@@ -1109,26 +1186,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
!S_ISLNK(inode->i_mode))
return;
- atomic_dec(&F2FS_I(inode)->dirty_pages);
+ percpu_counter_dec(&F2FS_I(inode)->dirty_pages);
dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
-static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
- return atomic_read(&sbi->nr_pages[count_type]);
+ return percpu_counter_sum_positive(&sbi->nr_pages[count_type]);
}
-static inline int get_dirty_pages(struct inode *inode)
+static inline s64 get_dirty_pages(struct inode *inode)
{
- return atomic_read(&F2FS_I(inode)->dirty_pages);
+ return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
- return ((get_pages(sbi, block_type) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >>
+ sbi->log_blocks_per_seg;
+
+ return segs / sbi->segs_per_sec;
}
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
@@ -1217,11 +1296,11 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
if (inode)
inode->i_blocks++;
- sbi->alloc_valid_block_count++;
sbi->total_valid_node_count++;
sbi->total_valid_block_count++;
spin_unlock(&sbi->stat_lock);
+ percpu_counter_inc(&sbi->alloc_valid_block_count);
return true;
}
@@ -1248,28 +1327,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
- sbi->total_valid_inode_count++;
- spin_unlock(&sbi->stat_lock);
+ percpu_counter_inc(&sbi->total_valid_inode_count);
}
static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
- sbi->total_valid_inode_count--;
- spin_unlock(&sbi->stat_lock);
+ percpu_counter_dec(&sbi->total_valid_inode_count);
}
-static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+static inline s64 valid_inode_count(struct f2fs_sb_info *sbi)
{
- return sbi->total_valid_inode_count;
+ return percpu_counter_sum_positive(&sbi->total_valid_inode_count);
}
static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
pgoff_t index, bool for_write)
{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct page *page = find_lock_page(mapping, index);
+ if (page)
+ return page;
+
+ if (time_to_inject(FAULT_PAGE_ALLOC))
+ return NULL;
+#endif
if (!for_write)
return grab_cache_page(mapping, index);
return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
@@ -1435,7 +1516,6 @@ enum {
FI_NO_ALLOC, /* should not allocate any blocks */
FI_FREE_NID, /* free allocated nide */
FI_UPDATE_DIR, /* should update inode block for consistency */
- FI_DELAY_IPUT, /* used for the recovery */
FI_NO_EXTENT, /* not to use the extent cache */
FI_INLINE_XATTR, /* used for inline xattr */
FI_INLINE_DATA, /* used for inline data*/
@@ -1618,12 +1698,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
}
-static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
-{
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
- sbi->sb->s_flags |= MS_RDONLY;
-}
-
static inline bool is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -1644,6 +1718,15 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
return S_ISREG(inode->i_mode);
}
+static inline void *f2fs_kmalloc(size_t size, gfp_t flags)
+{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_KMALLOC))
+ return NULL;
+#endif
+ return kmalloc(size, flags);
+}
+
static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
{
void *ret;
@@ -1710,7 +1793,7 @@ struct dentry *f2fs_get_parent(struct dentry *child);
*/
extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
void set_de_type(struct f2fs_dir_entry *, umode_t);
-
+unsigned char get_de_type(struct f2fs_dir_entry *);
struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
@@ -1731,6 +1814,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
const struct qstr *, f2fs_hash_t , unsigned int);
+int f2fs_add_regular_entry(struct inode *, const struct qstr *,
+ struct inode *, nid_t, umode_t);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
umode_t);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
@@ -1781,7 +1866,10 @@ void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
void sync_inode_page(struct dnode_of_data *);
-int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
+void move_node_page(struct page *, int);
+int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *,
+ bool);
+int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
@@ -1843,6 +1931,7 @@ void destroy_segment_manager_caches(void);
/*
* checkpoint.c
*/
+void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool);
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
@@ -1852,7 +1941,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
-void release_ino_entry(struct f2fs_sb_info *);
+void release_ino_entry(struct f2fs_sb_info *, bool);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1861,7 +1950,6 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void update_dirty_page(struct inode *, struct page *);
-void add_dirty_dir_inode(struct inode *);
void remove_dirty_inode(struct inode *);
int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
@@ -1880,6 +1968,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
void set_data_blkaddr(struct dnode_of_data *);
void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
+int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
@@ -1906,7 +1995,7 @@ void build_gc_manager(struct f2fs_sb_info *);
/*
* recovery.c
*/
-int recover_fsync_data(struct f2fs_sb_info *);
+int recover_fsync_data(struct f2fs_sb_info *, bool);
bool space_for_roll_forward(struct f2fs_sb_info *);
/*
@@ -1921,12 +2010,12 @@ struct f2fs_stat_info {
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
- int ndirty_node, ndirty_meta;
- int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files;
+ s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages;
+ unsigned int ndirty_dirs, ndirty_files;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
- int bg_gc, inmem_pages, wb_pages;
- int inline_xattr, inline_inode, inline_dir;
+ int bg_gc, wb_bios;
+ int inline_xattr, inline_inode, inline_dir, orphans;
unsigned int valid_count, valid_node_count, valid_inode_count;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
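
Several hot counters above (nr_pages, dirty_pages, alloc_valid_block_count, total_valid_inode_count) move from a shared atomic or spinlock-protected field to percpu_counter: writers update per-CPU state cheaply and only readers pay to sum it. The userspace sketch below approximates that with one slot per thread; thread and iteration counts are arbitrary, and the array is a crude stand-in for real per-CPU storage.

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4
#define NINC     100000

static long long local_count[NTHREADS];	/* one slot per "CPU" */

static void *worker(void *arg)
{
	long slot = (long)arg;

	for (int i = 0; i < NINC; i++)
		local_count[slot]++;		/* uncontended fast path */
	return NULL;
}

static long long counter_sum(void)
{
	long long sum = 0;

	for (int i = 0; i < NTHREADS; i++)	/* slow path: reader sums */
		sum += local_count[i];
	return sum;
}

int main(void)
{
	pthread_t t[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);

	printf("total = %lld (expected %lld)\n",
	       counter_sum(), (long long)NTHREADS * NINC);
	return 0;
}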
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index eb9d027e5981..f4c0086655c4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,7 +20,7 @@
#include <linux/uaccess.h>
#include <linux/mount.h>
#include <linux/pagevec.h>
-#include <linux/random.h>
+#include <linux/uuid.h>
#include "f2fs.h"
#include "node.h"
@@ -182,7 +182,8 @@ static void try_to_fix_pino(struct inode *inode)
}
}
-int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
+ int datasync, bool atomic)
{
struct inode *inode = file->f_mapping->host;
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -256,7 +257,9 @@ go_write:
goto out;
}
sync_nodes:
- sync_node_pages(sbi, ino, &wbc);
+ ret = fsync_node_pages(sbi, ino, &wbc, atomic);
+ if (ret)
+ goto out;
/* if cp_error was enabled, we should avoid infinite loop */
if (unlikely(f2fs_cp_error(sbi))) {
@@ -288,6 +291,11 @@ out:
return ret;
}
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ return f2fs_do_sync_file(file, start, end, datasync, false);
+}
+
static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pgoff_t pgofs, int whence)
{
@@ -555,6 +563,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
+ if (free_from >= sbi->max_file_blocks)
+ goto free_partial;
+
if (lock)
f2fs_lock_op(sbi);
@@ -573,7 +584,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
}
set_new_dnode(&dn, inode, ipage, NULL, 0);
- err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
+ err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA);
if (err) {
if (err == -ENOENT)
goto free_next;
@@ -596,7 +607,7 @@ free_next:
out:
if (lock)
f2fs_unlock_op(sbi);
-
+free_partial:
/* lastly zero out the first data page */
if (!err)
err = truncate_partial_data_page(inode, from, truncate_page);
@@ -986,6 +997,49 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
+static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
+ pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ pgoff_t index = start;
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ blkcnt_t count = 0;
+ int ret;
+
+ for (; index < end; index++, dn->ofs_in_node++) {
+ if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR)
+ count++;
+ }
+
+ dn->ofs_in_node = ofs_in_node;
+ ret = reserve_new_blocks(dn, count);
+ if (ret)
+ return ret;
+
+ dn->ofs_in_node = ofs_in_node;
+ for (index = start; index < end; index++, dn->ofs_in_node++) {
+ dn->data_blkaddr =
+ datablock_addr(dn->node_page, dn->ofs_in_node);
+ /*
+ * reserve_new_blocks will not guarantee entire block
+ * allocation.
+ */
+ if (dn->data_blkaddr == NULL_ADDR) {
+ ret = -ENOSPC;
+ break;
+ }
+ if (dn->data_blkaddr != NEW_ADDR) {
+ invalidate_blocks(sbi, dn->data_blkaddr);
+ dn->data_blkaddr = NEW_ADDR;
+ set_data_blkaddr(dn);
+ }
+ }
+
+ f2fs_update_extent_cache_range(dn, start, 0, index - start);
+
+ return ret;
+}
+
static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
int mode)
{
@@ -1036,35 +1090,32 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
(loff_t)pg_start << PAGE_SHIFT);
}
- for (index = pg_start; index < pg_end; index++) {
+ for (index = pg_start; index < pg_end;) {
struct dnode_of_data dn;
- struct page *ipage;
+ unsigned int end_offset;
+ pgoff_t end;
f2fs_lock_op(sbi);
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- f2fs_unlock_op(sbi);
- goto out;
- }
-
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, index);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
f2fs_unlock_op(sbi);
goto out;
}
- if (dn.data_blkaddr != NEW_ADDR) {
- invalidate_blocks(sbi, dn.data_blkaddr);
- f2fs_update_data_blkaddr(&dn, NEW_ADDR);
- }
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end = min(pg_end, end_offset - dn.ofs_in_node + index);
+
+ ret = f2fs_do_zero_range(&dn, index, end);
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+ if (ret)
+ goto out;
+ index = end;
new_size = max_t(loff_t, new_size,
- (loff_t)(index + 1) << PAGE_SHIFT);
+ (loff_t)index << PAGE_SHIFT);
}
if (off_end) {
@@ -1147,10 +1198,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- pgoff_t index, pg_start, pg_end;
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
+ pgoff_t pg_end;
loff_t new_size = i_size_read(inode);
- loff_t off_start, off_end;
- int ret = 0;
+ loff_t off_end;
+ int ret;
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
@@ -1162,43 +1214,35 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
f2fs_balance_fs(sbi, true);
- pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
-
- off_start = offset & (PAGE_SIZE - 1);
+ pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT;
off_end = (offset + len) & (PAGE_SIZE - 1);
- f2fs_lock_op(sbi);
+ map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT;
+ map.m_len = pg_end - map.m_lblk;
+ if (off_end)
+ map.m_len++;
- for (index = pg_start; index <= pg_end; index++) {
- struct dnode_of_data dn;
+ ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ if (ret) {
+ pgoff_t last_off;
- if (index == pg_end && !off_end)
- goto noalloc;
+ if (!map.m_len)
+ return ret;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = f2fs_reserve_block(&dn, index);
- if (ret)
- break;
-noalloc:
- if (pg_start == pg_end)
- new_size = offset + len;
- else if (index == pg_start && off_start)
- new_size = (loff_t)(index + 1) << PAGE_SHIFT;
- else if (index == pg_end)
- new_size = ((loff_t)index << PAGE_SHIFT) +
- off_end;
- else
- new_size += PAGE_SIZE;
+ last_off = map.m_lblk + map.m_len - 1;
+
+ /* update new size to the failed position */
+ new_size = (last_off == pg_end) ? offset + len:
+ (loff_t)(last_off + 1) << PAGE_SHIFT;
+ } else {
+ new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- i_size_read(inode) < new_size) {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
i_size_write(inode, new_size);
mark_inode_dirty(inode);
update_inode_page(inode);
}
- f2fs_unlock_op(sbi);
return ret;
}
@@ -1254,10 +1298,19 @@ out:
static int f2fs_release_file(struct inode *inode, struct file *filp)
{
+ /*
+ * f2fs_release_file() is called on every close, so we should not
+ * drop in-memory pages when another process closes the file.
+ */
+ if (!(filp->f_mode & FMODE_WRITE) ||
+ atomic_read(&inode->i_writecount) != 1)
+ return 0;
+
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
+ clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
@@ -1294,20 +1347,16 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
unsigned int oldflags;
int ret;
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (get_user(flags, (int __user *)arg))
+ return -EFAULT;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
-
- if (get_user(flags, (int __user *)arg)) {
- ret = -EFAULT;
- goto out;
- }
-
flags = f2fs_mask_flags(inode->i_mode, flags);
inode_lock(inode);
@@ -1350,17 +1399,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (f2fs_is_atomic_file(inode))
- return 0;
+ goto out;
ret = f2fs_convert_inline_inode(inode);
if (ret)
- return ret;
+ goto out;
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return 0;
+ if (!get_dirty_pages(inode))
+ goto out;
+
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ "Unexpected flush for atomic writes: ino=%lu, npages=%lld",
+ inode->i_ino, get_dirty_pages(inode));
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (ret)
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -1371,13 +1438,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
- if (f2fs_is_volatile_file(inode))
- return 0;
-
ret = mnt_want_write_file(filp);
if (ret)
return ret;
+ inode_lock(inode);
+
+ if (f2fs_is_volatile_file(inode))
+ goto err_out;
+
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
ret = commit_inmem_pages(inode);
@@ -1387,8 +1456,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
}
}
- ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
err_out:
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
@@ -1401,32 +1471,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (f2fs_is_volatile_file(inode))
- return 0;
+ goto out;
ret = f2fs_convert_inline_inode(inode);
if (ret)
- return ret;
+ goto out;
set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return 0;
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (!f2fs_is_volatile_file(inode))
- return 0;
+ goto out;
- if (!f2fs_is_first_block_written(inode))
- return truncate_partial_data_page(inode, 0, true);
+ if (!f2fs_is_first_block_written(inode)) {
+ ret = truncate_partial_data_page(inode, 0, true);
+ goto out;
+ }
- return punch_hole(inode, 0, F2FS_BLKSIZE);
+ ret = punch_hole(inode, 0, F2FS_BLKSIZE);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_abort_volatile_write(struct file *filp)
@@ -1441,15 +1533,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (ret)
return ret;
- if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ inode_lock(inode);
+
+ if (f2fs_is_atomic_file(inode))
drop_inmem_pages(inode);
- }
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
}
+ inode_unlock(inode);
+
mnt_drop_write_file(filp);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
@@ -1461,6 +1555,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct super_block *sb = sbi->sb;
__u32 in;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1468,31 +1563,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
if (get_user(in, (__u32 __user *)arg))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
switch (in) {
case F2FS_GOING_DOWN_FULLSYNC:
sb = freeze_bdev(sb->s_bdev);
if (sb && !IS_ERR(sb)) {
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
thaw_bdev(sb->s_bdev, sb);
}
break;
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
f2fs_sync_fs(sb, 1);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
break;
case F2FS_GOING_DOWN_NOSYNC:
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
break;
case F2FS_GOING_DOWN_METAFLUSH:
sync_meta_pages(sbi, META, LONG_MAX);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
break;
default:
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
f2fs_update_time(sbi, REQ_TIME);
- return 0;
+out:
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
@@ -1513,9 +1615,14 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
sizeof(range)))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ mnt_drop_write_file(filp);
if (ret < 0)
return ret;
@@ -1540,13 +1647,21 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
+ int ret;
if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
sizeof(policy)))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return fscrypt_process_policy(inode, &policy);
+ ret = fscrypt_process_policy(inode, &policy);
+
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -1603,6 +1718,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
__u32 sync;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1613,20 +1729,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
if (!sync) {
- if (!mutex_trylock(&sbi->gc_mutex))
- return -EBUSY;
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
} else {
mutex_lock(&sbi->gc_mutex);
}
- return f2fs_gc(sbi, sync);
+ ret = f2fs_gc(sbi, sync);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1634,7 +1760,14 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
- return f2fs_sync_fs(sbi->sb, 1);
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = f2fs_sync_fs(sbi->sb, 1);
+
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
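The file.c hunks above apply one pattern to every write-capable ioctl: take the mount write reference with mnt_want_write_file() first, then inode_lock(), do the work, and release both on every exit path through a shared out label. A minimal user-space sketch of that bracketing order, with the kernel helpers stubbed out (the names below are stand-ins, not the real APIs):

/*
 * Sketch only: the lock/unlock bracketing used by the f2fs ioctls above.
 * Every helper here is a user-space stand-in for the kernel API of the
 * same purpose (mnt_want_write_file, inode_lock, ...), not the real thing.
 */
#include <stdio.h>

static int mnt_want_write(void)  { puts("take mount write reference"); return 0; }
static void mnt_drop_write(void) { puts("drop mount write reference"); }
static void inode_lock(void)     { puts("lock inode"); }
static void inode_unlock(void)   { puts("unlock inode"); }

/* placeholder for the ioctl-specific body (convert inline, set flag, ...) */
static int do_ioctl_work(void)   { return 0; }

static int ioctl_pattern(void)
{
	int ret = mnt_want_write();
	if (ret)
		return ret;		/* nothing taken yet, nothing to undo */

	inode_lock();
	ret = do_ioctl_work();		/* failures fall through to the unlock */
	inode_unlock();

	mnt_drop_write();		/* always paired with the want above */
	return ret;
}

int main(void)
{
	return ioctl_pattern();
}

The key property is that an early return before the write reference is taken has nothing to undo, while every later failure funnels through the same unlock/drop pair.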
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b0051a97824c..38d56f678912 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -96,7 +96,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
- gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+ gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
goto out;
@@ -465,15 +465,7 @@ next_step:
continue;
}
- /* set page dirty and write it */
- if (gc_type == FG_GC) {
- f2fs_wait_on_page_writeback(node_page, NODE, true);
- set_page_dirty(node_page);
- } else {
- if (!PageWriteback(node_page))
- set_page_dirty(node_page);
- }
- f2fs_put_page(node_page, 1);
+ move_node_page(node_page, gc_type);
stat_inc_node_blk_count(sbi, 1, gc_type);
}
@@ -834,18 +826,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
f2fs_put_page(sum_page, 0);
}
- if (gc_type == FG_GC) {
- if (type == SUM_TYPE_NODE) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .for_reclaim = 0,
- };
- sync_node_pages(sbi, 0, &wbc);
- } else {
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- }
- }
+ if (gc_type == FG_GC)
+ f2fs_submit_merged_bio(sbi,
+ (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE);
blk_finish_plug(&plug);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index a2fbe6f427d3..a4bb155dd00a 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -161,7 +161,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
if (!f2fs_has_inline_data(inode))
return 0;
- page = grab_cache_page(inode->i_mapping, 0);
+ page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
if (!page)
return -ENOMEM;
@@ -303,11 +303,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
else
f2fs_put_page(ipage, 0);
- /*
- * For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs has occurred.
- */
- f2fs_bug_on(sbi, d.max < 0);
return de;
}
@@ -355,7 +350,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
* NOTE: ipage is grabbed by caller, but if any error occurs, we should
* release ipage in this function.
*/
-static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
struct f2fs_inline_dentry *inline_dentry)
{
struct page *page;
@@ -363,7 +358,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
struct f2fs_dentry_block *dentry_blk;
int err;
- page = grab_cache_page(dir->i_mapping, 0);
+ page = f2fs_grab_cache_page(dir->i_mapping, 0, false);
if (!page) {
f2fs_put_page(ipage, 1);
return -ENOMEM;
@@ -405,6 +400,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
stat_dec_inline_dir(dir);
clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
+ F2FS_I(dir)->i_current_depth = 1;
if (i_size_read(dir) < PAGE_SIZE) {
i_size_write(dir, PAGE_SIZE);
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -416,6 +412,105 @@ out:
return err;
}
+static int f2fs_add_inline_entries(struct inode *dir,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct f2fs_dentry_ptr d;
+ unsigned long bit_pos = 0;
+ int err = 0;
+
+ make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+
+ while (bit_pos < d.max) {
+ struct f2fs_dir_entry *de;
+ struct qstr new_name;
+ nid_t ino;
+ umode_t fake_mode;
+
+ if (!test_bit_le(bit_pos, d.bitmap)) {
+ bit_pos++;
+ continue;
+ }
+
+ de = &d.dentry[bit_pos];
+
+ if (unlikely(!de->name_len)) {
+ bit_pos++;
+ continue;
+ }
+
+ new_name.name = d.filename[bit_pos];
+ new_name.len = de->name_len;
+
+ ino = le32_to_cpu(de->ino);
+ fake_mode = get_de_type(de) << S_SHIFT;
+
+ err = f2fs_add_regular_entry(dir, &new_name, NULL,
+ ino, fake_mode);
+ if (err)
+ goto punch_dentry_pages;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ }
+ return 0;
+punch_dentry_pages:
+ truncate_inode_pages(&dir->i_data, 0);
+ truncate_blocks(dir, 0, false);
+ remove_dirty_inode(dir);
+ return err;
+}
+
+static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct f2fs_inline_dentry *backup_dentry;
+ struct f2fs_inode_info *fi = F2FS_I(dir);
+ int err;
+
+ backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry),
+ GFP_F2FS_ZERO);
+ if (!backup_dentry) {
+ f2fs_put_page(ipage, 1);
+ return -ENOMEM;
+ }
+
+ memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
+ truncate_inline_inode(ipage, 0);
+
+ unlock_page(ipage);
+
+ err = f2fs_add_inline_entries(dir, backup_dentry);
+ if (err)
+ goto recover;
+
+ lock_page(ipage);
+
+ stat_dec_inline_dir(dir);
+ clear_inode_flag(fi, FI_INLINE_DENTRY);
+ update_inode(dir, ipage);
+ kfree(backup_dentry);
+ return 0;
+recover:
+ lock_page(ipage);
+ memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
+ fi->i_current_depth = 0;
+ i_size_write(dir, MAX_INLINE_DATA);
+ update_inode(dir, ipage);
+ f2fs_put_page(ipage, 1);
+
+ kfree(backup_dentry);
+ return err;
+}
+
+static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ if (!F2FS_I(dir)->i_dir_level)
+ return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
+ else
+ return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
+}
+
int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode)
{
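f2fs_convert_inline_dir() above now selects between two strategies based on i_dir_level: for a flat directory (level 0), f2fs_move_inline_dirents() copies the inline dentries straight into dentry block 0; otherwise f2fs_move_rehashed_dirents() backs up the inline area, truncates it, replays each entry through the normal hashed add path, and restores the backup if the replay fails. A rough user-space sketch of that backup/replay/recover shape (replay_entries() is a hypothetical stand-in for f2fs_add_inline_entries(); nothing here is kernel code):

/*
 * Sketch of the backup/replay/recover shape used by
 * f2fs_move_rehashed_dirents() above.
 */
#include <stdlib.h>
#include <string.h>

#define INLINE_SIZE 256

/* re-add every backed-up entry through the normal (hashed) insert path */
static int replay_entries(const char *backup)
{
	(void)backup;
	return 0;
}

static int move_rehashed_dirents(char *inline_area)
{
	char *backup;
	int err;

	backup = malloc(INLINE_SIZE);		/* copy we can restore from */
	if (!backup)
		return -1;

	memcpy(backup, inline_area, INLINE_SIZE);
	memset(inline_area, 0, INLINE_SIZE);	/* truncate the inline area */

	err = replay_entries(backup);
	if (err)				/* recover the old contents */
		memcpy(inline_area, backup, INLINE_SIZE);

	free(backup);
	return err;
}

int main(void)
{
	char inline_area[INLINE_SIZE] = "packed dentries...";

	return move_rehashed_dirents(inline_area);
}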
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cb269c46ac25..2e68adab0d64 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -283,7 +283,7 @@ retry:
cond_resched();
goto retry;
} else if (err != -ENOENT) {
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
}
return 0;
}
@@ -344,7 +344,7 @@ void f2fs_evict_inode(struct inode *inode)
sb_start_intwrite(inode->i_sb);
set_inode_flag(fi, FI_NO_ALLOC);
i_size_write(inode, 0);
-
+retry:
if (F2FS_HAS_BLOCKS(inode))
err = f2fs_truncate(inode, true);
@@ -354,6 +354,12 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_unlock_op(sbi);
}
+ /* give it more chances in the ENOMEM case */
+ if (err == -ENOMEM) {
+ err = 0;
+ goto retry;
+ }
+
sb_end_intwrite(inode->i_sb);
no_delete:
stat_dec_inline_xattr(inode);
@@ -368,26 +374,11 @@ no_delete:
if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
if (is_inode_flag_set(fi, FI_FREE_NID)) {
- if (err && err != -ENOENT)
- alloc_nid_done(sbi, inode->i_ino);
- else
- alloc_nid_failed(sbi, inode->i_ino);
+ alloc_nid_failed(sbi, inode->i_ino);
clear_inode_flag(fi, FI_FREE_NID);
}
-
- if (err && err != -ENOENT) {
- if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) {
- /*
- * get here because we failed to release resource
- * of inode previously, reminder our user to run fsck
- * for fixing.
- */
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_msg(sbi->sb, KERN_WARNING,
- "inode (ino:%lu) resource leak, run fsck "
- "to fix this issue!", inode->i_ino);
- }
- }
+ f2fs_bug_on(sbi, err &&
+ !exist_written_data(sbi, inode->i_ino, ORPHAN_INO));
out_clear:
fscrypt_put_encryption_info(inode, NULL);
clear_inode(inode);
@@ -397,37 +388,32 @@ out_clear:
void handle_failed_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int err = 0;
+ struct node_info ni;
- clear_nlink(inode);
- make_bad_inode(inode);
+ /* don't make a bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
- i_size_write(inode, 0);
- if (F2FS_HAS_BLOCKS(inode))
- err = f2fs_truncate(inode, false);
-
- if (!err)
- err = remove_inode_page(inode);
-
/*
- * if we skip truncate_node in remove_inode_page bacause we failed
- * before, it's better to find another way to release resource of
- * this inode (e.g. valid block count, node block or nid). Here we
- * choose to add this inode to orphan list, so that we can call iput
- * for releasing in orphan recovery flow.
- *
* Note: we should add inode to orphan list before f2fs_unlock_op()
* so we can prevent losing this orphan when encoutering checkpoint
* and following suddenly power-off.
*/
- if (err && err != -ENOENT) {
- err = acquire_orphan_inode(sbi);
- if (!err)
+ get_node_info(sbi, inode->i_ino, &ni);
+
+ if (ni.blk_addr != NULL_ADDR) {
+ int err = acquire_orphan_inode(sbi);
+ if (err) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Too many orphan inodes, run fsck to fix.");
+ } else {
add_orphan_inode(sbi, inode->i_ino);
+ }
+ alloc_nid_done(sbi, inode->i_ino);
+ } else {
+ set_inode_flag(F2FS_I(inode), FI_FREE_NID);
}
- set_inode_flag(F2FS_I(inode), FI_FREE_NID);
f2fs_unlock_op(sbi);
/* iput will drop the inode object */
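Two behavior changes sit in the inode.c hunks: f2fs_evict_inode() now retries truncation when it fails with -ENOMEM instead of leaking blocks, and handle_failed_inode() only registers an orphan when get_node_info() shows the node block was actually written (ni.blk_addr != NULL_ADDR); otherwise the nid is simply released via FI_FREE_NID. A tiny sketch of the retry shape, with try_truncate() standing in for f2fs_truncate() (not the real call):

/*
 * Sketch of the retry-on-ENOMEM loop added to f2fs_evict_inode() above.
 */
#include <errno.h>

static int try_truncate(void)
{
	return 0;			/* pretend the truncation succeeded */
}

static int evict_like(void)
{
	int err;
retry:
	err = try_truncate();
	if (err == -ENOMEM)		/* transient: give it another chance */
		goto retry;
	return err;
}

int main(void)
{
	return evict_like();
}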
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 1a33de9d84b1..1f21aae80c40 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -407,6 +407,29 @@ cache:
up_write(&nm_i->nat_tree_lock);
}
+/*
+ * readahead up to n node pages (clamped to NIDS_PER_BLOCK) from the given offset.
+ */
+static void ra_node_pages(struct page *parent, int start, int n)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct blk_plug plug;
+ int i, end;
+ nid_t nid;
+
+ blk_start_plug(&plug);
+
+ /* Then, try readahead for siblings of the desired node */
+ end = start + n;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ ra_node_page(sbi, nid);
+ }
+
+ blk_finish_plug(&plug);
+}
+
pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
{
const long direct_index = ADDRS_PER_INODE(dn->inode);
@@ -707,6 +730,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
return PTR_ERR(page);
}
+ ra_node_pages(page, ofs, NIDS_PER_BLOCK);
+
rn = F2FS_NODE(page);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
@@ -784,6 +809,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
}
+ ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
+
/* free direct nodes linked to a partial indirect node */
for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
child_nid = get_nid(pages[idx], i, false);
@@ -832,7 +859,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
-restart:
+
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
@@ -896,10 +923,7 @@ skip_partial:
if (offset[1] == 0 &&
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto restart;
- }
+ BUG_ON(page->mapping != NODE_MAPPING(sbi));
f2fs_wait_on_page_writeback(page, NODE, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
@@ -998,7 +1022,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
- page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -1090,7 +1114,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
if (apage)
return;
- apage = grab_cache_page(NODE_MAPPING(sbi), nid);
+ apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!apage)
return;
@@ -1098,29 +1122,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_put_page(apage, err ? 1 : 0);
}
-/*
- * readahead MAX_RA_NODE number of node pages.
- */
-static void ra_node_pages(struct page *parent, int start)
-{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
- struct blk_plug plug;
- int i, end;
- nid_t nid;
-
- blk_start_plug(&plug);
-
- /* Then, try readahead for siblings of the desired node */
- end = start + MAX_RA_NODE;
- end = min(end, NIDS_PER_BLOCK);
- for (i = start; i < end; i++) {
- nid = get_nid(parent, i, false);
- ra_node_page(sbi, nid);
- }
-
- blk_finish_plug(&plug);
-}
-
static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
struct page *parent, int start)
{
@@ -1131,7 +1132,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
return ERR_PTR(-ENOENT);
f2fs_bug_on(sbi, check_nid_range(sbi, nid));
repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -1144,7 +1145,7 @@ repeat:
}
if (parent)
- ra_node_pages(parent, start + 1);
+ ra_node_pages(parent, start + 1, MAX_RA_NODE);
lock_page(page);
@@ -1196,19 +1197,17 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode;
struct page *page;
+ int ret;
/* should flush inline_data before evict_inode */
inode = ilookup(sbi->sb, ino);
if (!inode)
return;
- page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0);
+ page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
if (!page)
goto iput_out;
- if (!trylock_page(page))
- goto release_out;
-
if (!PageUptodate(page))
goto page_out;
@@ -1218,24 +1217,214 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
if (!clear_page_dirty_for_io(page))
goto page_out;
- if (!f2fs_write_inline_data(inode, page))
- inode_dec_dirty_pages(inode);
- else
+ ret = f2fs_write_inline_data(inode, page);
+ inode_dec_dirty_pages(inode);
+ if (ret)
set_page_dirty(page);
page_out:
- unlock_page(page);
-release_out:
- f2fs_put_page(page, 0);
+ f2fs_put_page(page, 1);
iput_out:
iput(inode);
}
-int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
- struct writeback_control *wbc)
+void move_node_page(struct page *node_page, int gc_type)
+{
+ if (gc_type == FG_GC) {
+ struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 1,
+ .for_reclaim = 0,
+ };
+
+ set_page_dirty(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
+
+ f2fs_bug_on(sbi, PageWriteback(node_page));
+ if (!clear_page_dirty_for_io(node_page))
+ goto out_page;
+
+ if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
+ unlock_page(node_page);
+ goto release_page;
+ } else {
+ /* set page dirty and write it */
+ if (!PageWriteback(node_page))
+ set_page_dirty(node_page);
+ }
+out_page:
+ unlock_page(node_page);
+release_page:
+ f2fs_put_page(node_page, 0);
+}
+
+static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
{
pgoff_t index, end;
struct pagevec pvec;
- int step = ino ? 2 : 0;
+ struct page *last_page = NULL;
+
+ pagevec_init(&pvec, 0);
+ index = 0;
+ end = ULONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_put_page(last_page, 0);
+ pagevec_release(&pvec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (!IS_DNODE(page) || !is_cold_node(page))
+ continue;
+ if (ino_of_node(page) != ino)
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (last_page)
+ f2fs_put_page(last_page, 0);
+
+ get_page(page);
+ last_page = page;
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return last_page;
+}
+
+int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+ struct writeback_control *wbc, bool atomic)
+{
+ pgoff_t index, end;
+ struct pagevec pvec;
+ int ret = 0;
+ struct page *last_page = NULL;
+ bool marked = false;
+
+ if (atomic) {
+ last_page = last_fsync_dnode(sbi, ino);
+ if (IS_ERR_OR_NULL(last_page))
+ return PTR_ERR_OR_ZERO(last_page);
+ }
+retry:
+ pagevec_init(&pvec, 0);
+ index = 0;
+ end = ULONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_put_page(last_page, 0);
+ pagevec_release(&pvec);
+ return -EIO;
+ }
+
+ if (!IS_DNODE(page) || !is_cold_node(page))
+ continue;
+ if (ino_of_node(page) != ino)
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page) && page != last_page) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ f2fs_wait_on_page_writeback(page, NODE, true);
+ BUG_ON(PageWriteback(page));
+
+ if (!atomic || page == last_page) {
+ set_fsync_mark(page, 1);
+ if (IS_INODE(page))
+ set_dentry_mark(page,
+ need_dentry_mark(sbi, ino));
+ /* may have been written by another thread */
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
+ if (ret) {
+ unlock_page(page);
+ f2fs_put_page(last_page, 0);
+ break;
+ }
+ if (page == last_page) {
+ f2fs_put_page(page, 0);
+ marked = true;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+
+ if (ret || marked)
+ break;
+ }
+ if (!ret && atomic && !marked) {
+ f2fs_msg(sbi->sb, KERN_DEBUG,
+ "Retry to write fsync mark: ino=%u, idx=%lx",
+ ino, last_page->index);
+ lock_page(last_page);
+ set_page_dirty(last_page);
+ unlock_page(last_page);
+ goto retry;
+ }
+ return ret ? -EIO : 0;
+}
+
+int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
+{
+ pgoff_t index, end;
+ struct pagevec pvec;
+ int step = 0;
int nwritten = 0;
pagevec_init(&pvec, 0);
@@ -1274,15 +1463,8 @@ next_step:
if (step == 2 && (!IS_DNODE(page) ||
!is_cold_node(page)))
continue;
-
- /*
- * If an fsync mode,
- * we should not skip writing node pages.
- */
lock_node:
- if (ino && ino_of_node(page) == ino)
- lock_page(page);
- else if (!trylock_page(page))
+ if (!trylock_page(page))
continue;
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
@@ -1290,8 +1472,6 @@ continue_unlock:
unlock_page(page);
continue;
}
- if (ino && ino_of_node(page) != ino)
- goto continue_unlock;
if (!PageDirty(page)) {
/* someone wrote it for us */
@@ -1299,7 +1479,7 @@ continue_unlock:
}
/* flush inline_data */
- if (!ino && is_inline_node(page)) {
+ if (is_inline_node(page)) {
clear_inline_node(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
@@ -1312,17 +1492,8 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- /* called by fsync() */
- if (ino && IS_DNODE(page)) {
- set_fsync_mark(page, 1);
- if (IS_INODE(page))
- set_dentry_mark(page,
- need_dentry_mark(sbi, ino));
- nwritten++;
- } else {
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
- }
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
unlock_page(page);
@@ -1470,7 +1641,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
- sync_node_pages(sbi, 0, wbc);
+ sync_node_pages(sbi, wbc);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return 0;
@@ -1524,7 +1695,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
struct nat_entry *ne;
- bool allocated = false;
if (!available_free_memory(sbi, FREE_NIDS))
return -1;
@@ -1538,8 +1708,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
ne = __lookup_nat_cache(nm_i, nid);
if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
- allocated = true;
- if (allocated)
return 0;
}
@@ -1672,6 +1840,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
retry:
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_ALLOC_NID))
+ return false;
+#endif
if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
return false;
@@ -1846,7 +2018,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (unlikely(old_ni.blk_addr != NULL_ADDR))
return -EINVAL;
- ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
+ ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage)
return -ENOMEM;
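fsync_node_pages() above replaces the old ino-filtered sync_node_pages() path for fsync. In atomic mode it first locates the last dirty dnode of the inode (last_fsync_dnode()), writes every dirty dnode but places the fsync mark only on that last page, and if that page was cleaned by another writer before this thread could issue the write, it is re-dirtied and the whole scan is retried so the mark is guaranteed to reach disk. A user-space sketch of that "mark only the last element, retry until the marked write happens" loop over a plain array (nothing here is kernel code):

/*
 * Sketch of the fsync_node_pages() control flow: write every dirty
 * element, put the fsync mark only on the last one, and rescan if that
 * last element was cleaned by someone else before we wrote it.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR 8

static bool dirty[NR] = { true, false, true, true, false, false, true, false };

static int write_one(int i, bool fsync_mark)
{
	printf("write %d%s\n", i, fsync_mark ? " [fsync mark]" : "");
	dirty[i] = false;
	return 0;
}

static int fsync_like(void)
{
	bool marked = false;
	int last, i;

	/* step 1: find the last dirty element (last_fsync_dnode) */
	for (last = NR - 1; last >= 0 && !dirty[last]; last--)
		;
	if (last < 0)
		return 0;
retry:
	for (i = 0; i < NR; i++) {
		if (!dirty[i] && i != last)
			continue;		/* someone wrote it for us */
		if (write_one(i, i == last))
			return -1;
		if (i == last) {
			marked = true;		/* the marked write went out */
			break;
		}
	}
	if (!marked) {
		dirty[last] = true;		/* re-dirty and scan again */
		goto retry;
	}
	return 0;
}

int main(void)
{
	return fsync_like();
}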
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 011942f94d64..3d7216d7a288 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab;
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
{
- if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
- > sbi->user_block_count)
+ s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
+
+ if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
return true;
}
@@ -67,7 +68,30 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
return NULL;
}
-static int recover_dentry(struct inode *inode, struct page *ipage)
+static struct fsync_inode_entry *add_fsync_inode(struct list_head *head,
+ struct inode *inode)
+{
+ struct fsync_inode_entry *entry;
+
+ entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
+ if (!entry)
+ return NULL;
+
+ entry->inode = inode;
+ list_add_tail(&entry->list, head);
+
+ return entry;
+}
+
+static void del_fsync_inode(struct fsync_inode_entry *entry)
+{
+ iput(entry->inode);
+ list_del(&entry->list);
+ kmem_cache_free(fsync_entry_slab, entry);
+}
+
+static int recover_dentry(struct inode *inode, struct page *ipage,
+ struct list_head *dir_list)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -75,18 +99,29 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
struct qstr name;
struct page *page;
struct inode *dir, *einode;
+ struct fsync_inode_entry *entry;
int err = 0;
- dir = f2fs_iget(inode->i_sb, pino);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
+ entry = get_fsync_inode(dir_list, pino);
+ if (!entry) {
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
+ }
+
+ entry = add_fsync_inode(dir_list, dir);
+ if (!entry) {
+ err = -ENOMEM;
+ iput(dir);
+ goto out;
+ }
}
- if (file_enc_name(inode)) {
- iput(dir);
+ dir = entry->inode;
+
+ if (file_enc_name(inode))
return 0;
- }
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
@@ -94,7 +129,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
if (unlikely(name.len > F2FS_NAME_LEN)) {
WARN_ON(1);
err = -ENAMETOOLONG;
- goto out_err;
+ goto out;
}
retry:
de = f2fs_find_entry(dir, &name, &page);
@@ -120,23 +155,12 @@ retry:
goto retry;
}
err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
- if (err)
- goto out_err;
-
- if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) {
- iput(dir);
- } else {
- add_dirty_dir_inode(dir);
- set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
- }
goto out;
out_unmap_put:
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
-out_err:
- iput(dir);
out:
f2fs_msg(inode->i_sb, KERN_NOTICE,
"%s: ino = %x, name = %s, dir = %lx, err = %d",
@@ -198,6 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
+ struct inode *inode;
struct page *page = NULL;
block_t blkaddr;
int err = 0;
@@ -206,8 +231,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- ra_meta_pages(sbi, blkaddr, 1, META_POR, true);
-
while (1) {
struct fsync_inode_entry *entry;
@@ -233,35 +256,32 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
break;
}
- /* add this fsync inode to the list */
- entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
- if (!entry) {
- err = -ENOMEM;
- break;
- }
/*
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
- if (IS_ERR(entry->inode)) {
- err = PTR_ERR(entry->inode);
- kmem_cache_free(fsync_entry_slab, entry);
+ inode = f2fs_iget(sbi->sb, ino_of_node(page));
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
if (err == -ENOENT) {
err = 0;
goto next;
}
break;
}
- list_add_tail(&entry->list, head);
+
+ /* add this fsync inode to the list */
+ entry = add_fsync_inode(head, inode);
+ if (!entry) {
+ err = -ENOMEM;
+ iput(inode);
+ break;
+ }
}
entry->blkaddr = blkaddr;
- if (IS_INODE(page)) {
- entry->last_inode = blkaddr;
- if (is_dent_dnode(page))
- entry->last_dentry = blkaddr;
- }
+ if (IS_INODE(page) && is_dent_dnode(page))
+ entry->last_dentry = blkaddr;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
@@ -277,11 +297,8 @@ static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
- list_for_each_entry_safe(entry, tmp, head, list) {
- iput(entry->inode);
- list_del(&entry->list);
- kmem_cache_free(fsync_entry_slab, entry);
- }
+ list_for_each_entry_safe(entry, tmp, head, list)
+ del_fsync_inode(entry);
}
static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
@@ -444,8 +461,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
*/
if (dest == NEW_ADDR) {
truncate_data_blocks_range(&dn, 1);
- err = reserve_new_block(&dn);
- f2fs_bug_on(sbi, err);
+ reserve_new_block(&dn);
continue;
}
@@ -454,6 +470,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ while (err)
+ err = reserve_new_block(&dn);
+#endif
/* We should not get -ENOSPC */
f2fs_bug_on(sbi, err);
}
@@ -486,7 +506,8 @@ out:
return err;
}
-static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
+ struct list_head *dir_list)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
@@ -513,7 +534,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
break;
}
- entry = get_fsync_inode(head, ino_of_node(page));
+ entry = get_fsync_inode(inode_list, ino_of_node(page));
if (!entry)
goto next;
/*
@@ -521,10 +542,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
- if (entry->last_inode == blkaddr)
+ if (IS_INODE(page))
recover_inode(entry->inode, page);
if (entry->last_dentry == blkaddr) {
- err = recover_dentry(entry->inode, page);
+ err = recover_dentry(entry->inode, page, dir_list);
if (err) {
f2fs_put_page(page, 1);
break;
@@ -536,11 +557,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
break;
}
- if (entry->blkaddr == blkaddr) {
- iput(entry->inode);
- list_del(&entry->list);
- kmem_cache_free(fsync_entry_slab, entry);
- }
+ if (entry->blkaddr == blkaddr)
+ del_fsync_inode(entry);
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
@@ -551,12 +569,14 @@ next:
return err;
}
-int recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct list_head inode_list;
+ struct list_head dir_list;
block_t blkaddr;
int err;
+ int ret = 0;
bool need_writecp = false;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -565,6 +585,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
return -ENOMEM;
INIT_LIST_HEAD(&inode_list);
+ INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
@@ -573,21 +594,22 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list);
- if (err)
+ if (err || list_empty(&inode_list))
goto out;
- if (list_empty(&inode_list))
+ if (check_only) {
+ ret = 1;
goto out;
+ }
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list);
+ err = recover_data(sbi, &inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
- kmem_cache_destroy(fsync_entry_slab);
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
@@ -625,5 +647,8 @@ out:
} else {
mutex_unlock(&sbi->cp_mutex);
}
- return err;
+
+ destroy_fsync_dnodes(&dir_list);
+ kmem_cache_destroy(fsync_entry_slab);
+ return ret ? ret : err;
}
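recover_fsync_data() above gains a check_only mode and a tri-state result the mount path depends on: a negative value is an error, 0 means there was nothing to recover, and a positive value (only produced when check_only is true) means fsync data exists but was not replayed. Directories touched by recover_dentry() are also cached on dir_list and released in one pass instead of the old per-entry iget/iput with FI_DELAY_IPUT. A small sketch of how a caller such as f2fs_fill_super() can interpret that convention (recover_like() is a stand-in, not the real function):

/*
 * Sketch of the tri-state return convention of recover_fsync_data():
 * <0 error, 0 nothing to recover, >0 fsync data present (reported only
 * when check_only is set).
 */
#include <stdio.h>

static int recover_like(int check_only, int have_fsync_data)
{
	if (!have_fsync_data)
		return 0;
	if (check_only)
		return 1;		/* report it, do not replay */
	/* ... replay of the fsynced dnodes would happen here ... */
	return 0;
}

int main(void)
{
	int readonly_mount = 1;
	int err = recover_like(/*check_only=*/1, /*have_fsync_data=*/1);

	if (err < 0)
		printf("recovery failed: %d\n", err);
	else if (err > 0 && !readonly_mount)
		printf("need to recover fsync data\n");	/* rw mount would fail with -EINVAL */
	else if (err > 0)
		printf("fsync data present, skipped on read-only mount\n");
	return 0;
}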
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 540669d6978e..2e6f537a0e7d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -223,9 +223,11 @@ static int __revoke_inmem_pages(struct inode *inode,
f2fs_put_dnode(&dn);
}
next:
- ClearPageUptodate(page);
+ /* we don't need to invalidate this in the successful case */
+ if (drop || recover)
+ ClearPageUptodate(page);
set_page_private(page, 0);
- ClearPageUptodate(page);
+ ClearPagePrivate(page);
f2fs_put_page(page, 1);
list_del(&cur->list);
@@ -239,6 +241,8 @@ void drop_inmem_pages(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+
mutex_lock(&fi->inmem_lock);
__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
mutex_unlock(&fi->inmem_lock);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 975c33df65c7..7a756ff5a36d 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -158,16 +158,17 @@ struct victim_sel_policy {
};
struct seg_entry {
- unsigned short valid_blocks; /* # of valid blocks */
+ unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */
+ unsigned int valid_blocks:10; /* # of valid blocks */
+ unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */
+ unsigned int padding:6; /* padding */
unsigned char *cur_valid_map; /* validity bitmap of blocks */
/*
* # of valid blocks and the validity bitmap stored in the the last
* checkpoint pack. This information is used by the SSR mode.
*/
- unsigned short ckpt_valid_blocks;
- unsigned char *ckpt_valid_map;
+ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */
unsigned char *discard_map;
- unsigned char type; /* segment type like CURSEG_XXX_TYPE */
unsigned long long mtime; /* modification time of the segment */
};
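The seg_entry change above packs type, valid_blocks, ckpt_valid_blocks and padding into a single 32-bit word (6 + 10 + 10 + 6 bits). That is safe because an f2fs segment holds at most 512 blocks, well within the 10-bit range. A stand-alone check of the packing:

/*
 * Illustrative check of the seg_entry bit packing: the four fields fit in
 * one 32-bit word, and 10 bits (max 1023) cover the 512 blocks per 2MB
 * f2fs segment with room to spare.
 */
#include <stdio.h>

struct packed_counts {
	unsigned int type:6;
	unsigned int valid_blocks:10;
	unsigned int ckpt_valid_blocks:10;
	unsigned int padding:6;
};

int main(void)
{
	printf("sizeof(struct packed_counts) = %zu\n", sizeof(struct packed_counts));
	printf("max valid_blocks = %u (>= 512 blocks per segment)\n", (1u << 10) - 1);
	return 0;
}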
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 006f87d69921..74cc8520b8b1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -39,6 +39,30 @@ static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
static struct kset *f2fs_kset;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+struct f2fs_fault_info f2fs_fault;
+
+char *fault_name[FAULT_MAX] = {
+ [FAULT_KMALLOC] = "kmalloc",
+ [FAULT_PAGE_ALLOC] = "page alloc",
+ [FAULT_ALLOC_NID] = "alloc nid",
+ [FAULT_ORPHAN] = "orphan",
+ [FAULT_BLOCK] = "no more block",
+ [FAULT_DIR_DEPTH] = "too big dir depth",
+};
+
+static void f2fs_build_fault_attr(unsigned int rate)
+{
+ if (rate) {
+ atomic_set(&f2fs_fault.inject_ops, 0);
+ f2fs_fault.inject_rate = rate;
+ f2fs_fault.inject_type = (1 << FAULT_MAX) - 1;
+ } else {
+ memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info));
+ }
+}
+#endif
+
/* f2fs-wide shrinker description */
static struct shrinker f2fs_shrinker_info = {
.scan_objects = f2fs_shrink_scan,
@@ -68,6 +92,7 @@ enum {
Opt_noextent_cache,
Opt_noinline_data,
Opt_data_flush,
+ Opt_fault_injection,
Opt_err,
};
@@ -93,6 +118,7 @@ static match_table_t f2fs_tokens = {
{Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
{Opt_data_flush, "data_flush"},
+ {Opt_fault_injection, "fault_injection=%u"},
{Opt_err, NULL},
};
@@ -102,6 +128,10 @@ enum {
SM_INFO, /* struct f2fs_sm_info */
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ FAULT_INFO_RATE, /* struct f2fs_fault_info */
+ FAULT_INFO_TYPE, /* struct f2fs_fault_info */
+#endif
};
struct f2fs_attr {
@@ -123,6 +153,11 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return (unsigned char *)NM_I(sbi);
else if (struct_type == F2FS_SBI)
return (unsigned char *)sbi;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ else if (struct_type == FAULT_INFO_RATE ||
+ struct_type == FAULT_INFO_TYPE)
+ return (unsigned char *)&f2fs_fault;
+#endif
return NULL;
}
@@ -172,6 +207,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret < 0)
return ret;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
+ return -EINVAL;
+#endif
*ui = t;
return count;
}
@@ -237,6 +276,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
+F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
+#endif
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
@@ -273,6 +316,22 @@ static struct kobj_type f2fs_ktype = {
.release = f2fs_sb_release,
};
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+/* sysfs for f2fs fault injection */
+static struct kobject f2fs_fault_inject;
+
+static struct attribute *f2fs_fault_attrs[] = {
+ ATTR_LIST(inject_rate),
+ ATTR_LIST(inject_type),
+ NULL
+};
+
+static struct kobj_type f2fs_fault_ktype = {
+ .default_attrs = f2fs_fault_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+};
+#endif
+
void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
@@ -300,6 +359,10 @@ static int parse_options(struct super_block *sb, char *options)
char *p, *name;
int arg = 0;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(0);
+#endif
+
if (!options)
return 0;
@@ -433,6 +496,16 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_data_flush:
set_opt(sbi, DATA_FLUSH);
break;
+ case Opt_fault_injection:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(arg);
+#else
+ f2fs_msg(sb, KERN_INFO,
+ "FAULT_INJECTION was not selected");
+#endif
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -453,9 +526,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_once((void *) fi);
+ if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) {
+ kmem_cache_free(f2fs_inode_cachep, fi);
+ return NULL;
+ }
+
/* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
- atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
@@ -530,15 +607,27 @@ static void f2fs_i_callback(struct rcu_head *head)
static void f2fs_destroy_inode(struct inode *inode)
{
+ percpu_counter_destroy(&F2FS_I(inode)->dirty_pages);
call_rcu(&inode->i_rcu, f2fs_i_callback);
}
+static void destroy_percpu_info(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < NR_COUNT_TYPE; i++)
+ percpu_counter_destroy(&sbi->nr_pages[i]);
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ percpu_counter_destroy(&sbi->total_valid_inode_count);
+}
+
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
kobject_del(&sbi->s_kobj);
@@ -568,15 +657,14 @@ static void f2fs_put_super(struct super_block *sb)
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_ino_entry(sbi);
+ release_ino_entry(sbi, true);
release_discard_addrs(sbi);
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
/* our cp_error case, we can wait for any writeback page */
- if (get_pages(sbi, F2FS_WRITEBACK))
- f2fs_flush_merged_bios(sbi);
+ f2fs_flush_merged_bios(sbi);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -593,6 +681,8 @@ static void f2fs_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->raw_super);
+
+ destroy_percpu_info(sbi);
kfree(sbi);
}
@@ -745,19 +835,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
return 0;
}
-static int segment_info_open_fs(struct inode *inode, struct file *file)
+static int segment_bits_seq_show(struct seq_file *seq, void *offset)
{
- return single_open(file, segment_info_seq_show, PDE_DATA(inode));
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs =
+ le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i, j;
+
+ seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
+ for (i = 0; i < total_segs; i++) {
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ seq_printf(seq, "%-10d", i);
+ seq_printf(seq, "%d|%-3u|", se->type,
+ get_valid_blocks(sbi, i, 1));
+ for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
+ seq_printf(seq, "%x ", se->cur_valid_map[j]);
+ seq_putc(seq, '\n');
+ }
+ return 0;
}
-static const struct file_operations f2fs_seq_segment_info_fops = {
- .owner = THIS_MODULE,
- .open = segment_info_open_fs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+#define F2FS_PROC_FILE_DEF(_name) \
+static int _name##_open_fs(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, _name##_seq_show, PDE_DATA(inode)); \
+} \
+ \
+static const struct file_operations f2fs_seq_##_name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = _name##_open_fs, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
};
+F2FS_PROC_FILE_DEF(segment_info);
+F2FS_PROC_FILE_DEF(segment_bits);
+
static void default_options(struct f2fs_sb_info *sbi)
{
/* init some FS parameters */
@@ -791,13 +909,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
- if (*flags & MS_RDONLY) {
- set_opt(sbi, FASTBOOT);
- set_sbi_flag(sbi, SBI_IS_DIRTY);
+ /* recover superblocks we couldn't write due to previous RO mount */
+ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+ err = f2fs_commit_super(sbi, false);
+ f2fs_msg(sb, KERN_INFO,
+ "Try to recover all the superblocks, ret: %d", err);
+ if (!err)
+ clear_sbi_flag(sbi, SBI_NEED_SB_WRITE);
}
- sync_filesystem(sb);
-
sbi->mount_opt.opt = 0;
default_options(sbi);
@@ -829,7 +949,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
if (sbi->gc_thread) {
stop_gc_thread(sbi);
- f2fs_sync_fs(sb, 1);
need_restart_gc = true;
}
} else if (!sbi->gc_thread) {
@@ -839,6 +958,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
+ if (*flags & MS_RDONLY) {
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ sync_inodes_sb(sb);
+
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ set_sbi_flag(sbi, SBI_IS_CLOSE);
+ f2fs_sync_fs(sb, 1);
+ clear_sbi_flag(sbi, SBI_IS_CLOSE);
+ }
+
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -852,8 +981,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
skip:
/* Update the POSIXACL Flag */
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+
return 0;
restore_gc:
if (need_restart_gc) {
@@ -893,6 +1023,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
ctx, len, NULL);
}
+static int f2fs_key_prefix(struct inode *inode, u8 **key)
+{
+ *key = F2FS_I_SB(inode)->key_prefix;
+ return F2FS_I_SB(inode)->key_prefix_size;
+}
+
static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
@@ -909,6 +1045,7 @@ static unsigned f2fs_max_namelen(struct inode *inode)
static struct fscrypt_operations f2fs_cryptops = {
.get_context = f2fs_get_context,
+ .key_prefix = f2fs_key_prefix,
.set_context = f2fs_set_context,
.is_encrypted = f2fs_encrypted_inode,
.empty_dir = f2fs_empty_dir,
@@ -998,11 +1135,12 @@ static int __f2fs_commit_super(struct buffer_head *bh,
return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
}
-static inline bool sanity_check_area_boundary(struct super_block *sb,
+static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
struct buffer_head *bh)
{
struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
(bh->b_data + F2FS_SUPER_OFFSET);
+ struct super_block *sb = sbi->sb;
u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
@@ -1081,6 +1219,7 @@ static inline bool sanity_check_area_boundary(struct super_block *sb,
segment0_blkaddr) >> log_blocks_per_seg);
if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) {
+ set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
res = "internally";
} else {
err = __f2fs_commit_super(bh, NULL);
@@ -1098,11 +1237,12 @@ static inline bool sanity_check_area_boundary(struct super_block *sb,
return false;
}
-static int sanity_check_raw_super(struct super_block *sb,
+static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
struct buffer_head *bh)
{
struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
(bh->b_data + F2FS_SUPER_OFFSET);
+ struct super_block *sb = sbi->sb;
unsigned int blocksize;
if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
@@ -1169,7 +1309,7 @@ static int sanity_check_raw_super(struct super_block *sb,
}
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
- if (sanity_check_area_boundary(sb, bh))
+ if (sanity_check_area_boundary(sbi, bh))
return 1;
return 0;
@@ -1201,7 +1341,6 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
static void init_sb_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = sbi->raw_super;
- int i;
sbi->log_sectors_per_block =
le32_to_cpu(raw_super->log_sectors_per_block);
@@ -1221,9 +1360,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->cur_victim_sec = NULL_SECNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
- for (i = 0; i < NR_COUNT_TYPE; i++)
- atomic_set(&sbi->nr_pages[i], 0);
-
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
@@ -1231,6 +1367,30 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
+ F2FS_KEY_DESC_PREFIX_SIZE);
+ sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE;
+#endif
+}
+
+static int init_percpu_info(struct f2fs_sb_info *sbi)
+{
+ int i, err;
+
+ for (i = 0; i < NR_COUNT_TYPE; i++) {
+ err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
+ if (err)
+ return err;
+ }
+
+ err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL);
+ if (err)
+ return err;
+
+ return percpu_counter_init(&sbi->total_valid_inode_count, 0,
+ GFP_KERNEL);
}
/*
@@ -1239,10 +1399,11 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
* to get the first valid one. If any one of them is broken, we pass
* them recovery flag back to the caller.
*/
-static int read_raw_super_block(struct super_block *sb,
+static int read_raw_super_block(struct f2fs_sb_info *sbi,
struct f2fs_super_block **raw_super,
int *valid_super_block, int *recovery)
{
+ struct super_block *sb = sbi->sb;
int block;
struct buffer_head *bh;
struct f2fs_super_block *super;
@@ -1262,7 +1423,7 @@ static int read_raw_super_block(struct super_block *sb,
}
/* sanity checking of raw super */
- if (sanity_check_raw_super(sb, bh)) {
+ if (sanity_check_raw_super(sbi, bh)) {
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
@@ -1298,6 +1459,12 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
struct buffer_head *bh;
int err;
+ if ((recover && f2fs_readonly(sbi->sb)) ||
+ bdev_read_only(sbi->sb->s_bdev)) {
+ set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
+ return -EROFS;
+ }
+
/* write back-up superblock first */
bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1);
if (!bh)
@@ -1323,7 +1490,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
struct inode *root;
- long err;
+ int err;
bool retry = true, need_fsck = false;
char *options = NULL;
int recovery, i, valid_super_block;
@@ -1340,6 +1507,8 @@ try_onemore:
if (!sbi)
return -ENOMEM;
+ sbi->sb = sb;
+
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
@@ -1355,7 +1524,7 @@ try_onemore:
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &valid_super_block,
+ err = read_raw_super_block(sbi, &raw_super, &valid_super_block,
&recovery);
if (err)
goto free_sbi;
@@ -1390,7 +1559,6 @@ try_onemore:
memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
- sbi->sb = sb;
sbi->raw_super = raw_super;
sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
@@ -1415,6 +1583,10 @@ try_onemore:
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
+ err = init_percpu_info(sbi);
+ if (err)
+ goto free_options;
+
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
@@ -1431,13 +1603,13 @@ try_onemore:
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
- sbi->total_valid_inode_count =
- le32_to_cpu(sbi->ckpt->valid_inode_count);
+ percpu_counter_set(&sbi->total_valid_inode_count,
+ le32_to_cpu(sbi->ckpt->valid_inode_count));
sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
sbi->total_valid_block_count =
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
- sbi->alloc_valid_block_count = 0;
+
for (i = 0; i < NR_INODE_TYPE; i++) {
INIT_LIST_HEAD(&sbi->inode_list[i]);
spin_lock_init(&sbi->inode_lock[i]);
@@ -1515,9 +1687,12 @@ try_onemore:
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
- if (sbi->s_proc)
+ if (sbi->s_proc) {
proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_info_fops, sb);
+ proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_bits_fops, sb);
+ }
sbi->s_kobj.kset = f2fs_kset;
init_completion(&sbi->s_kobj_unregister);
@@ -1541,14 +1716,24 @@ try_onemore:
if (need_fsck)
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = recover_fsync_data(sbi);
- if (err) {
+ err = recover_fsync_data(sbi, false);
+ if (err < 0) {
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
- "Cannot recover all fsync data errno=%ld", err);
+ "Cannot recover all fsync data errno=%d", err);
+ goto free_kobj;
+ }
+ } else {
+ err = recover_fsync_data(sbi, true);
+
+ if (!f2fs_readonly(sb) && err > 0) {
+ err = -EINVAL;
+ f2fs_msg(sb, KERN_ERR,
+ "Need to recover fsync data");
goto free_kobj;
}
}
+
/* recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -1565,10 +1750,10 @@ try_onemore:
kfree(options);
/* recover broken superblock */
- if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
+ if (recovery) {
err = f2fs_commit_super(sbi, true);
f2fs_msg(sb, KERN_INFO,
- "Try to recover %dth superblock, ret: %ld",
+ "Try to recover %dth superblock, ret: %d",
sbi->valid_super_block ? 1 : 2, err);
}
@@ -1583,6 +1768,7 @@ free_kobj:
free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
f2fs_destroy_stats(sbi);
@@ -1603,6 +1789,7 @@ free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
free_options:
+ destroy_percpu_info(sbi);
kfree(options);
free_sb_buf:
kfree(raw_super);
@@ -1688,6 +1875,16 @@ static int __init init_f2fs_fs(void)
err = -ENOMEM;
goto free_extent_cache;
}
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_fault_inject.kset = f2fs_kset;
+ f2fs_build_fault_attr(0);
+ err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype,
+ NULL, "fault_injection");
+ if (err) {
+ f2fs_fault_inject.kset = NULL;
+ goto free_kset;
+ }
+#endif
err = register_shrinker(&f2fs_shrinker_info);
if (err)
goto free_kset;
@@ -1706,6 +1903,10 @@ free_filesystem:
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
free_kset:
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (f2fs_fault_inject.kset)
+ kobject_put(&f2fs_fault_inject);
+#endif
kset_unregister(f2fs_kset);
free_extent_cache:
destroy_extent_cache();
@@ -1725,14 +1926,17 @@ static void __exit exit_f2fs_fs(void)
{
remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
- unregister_shrinker(&f2fs_shrinker_info);
unregister_filesystem(&f2fs_fs_type);
+ unregister_shrinker(&f2fs_shrinker_info);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ kobject_put(&f2fs_fault_inject);
+#endif
+ kset_unregister(f2fs_kset);
destroy_extent_cache();
destroy_checkpoint_caches();
destroy_segment_manager_caches();
destroy_node_manager_caches();
destroy_inodecache();
- kset_unregister(f2fs_kset);
f2fs_destroy_trace_ios();
}
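The super.c hunks introduce the fault-injection framework: a global f2fs_fault_info holds inject_rate and an inject_type bitmask, exposed through the fault_injection kobject and settable via the fault_injection=%u mount option, and call sites such as alloc_nid() guard themselves with time_to_inject(FAULT_*) under CONFIG_F2FS_FAULT_INJECTION. The body of time_to_inject() is not part of this diff, so the sketch below only assumes a counter-based rate/type check; treat it as an illustration of the mechanism, not the actual implementation:

/*
 * Assumed mechanism: inject a failure for an enabled fault type once
 * every inject_rate calls. Plain user-space code, no kernel atomics.
 */
#include <stdbool.h>
#include <stdio.h>

enum fault_type { FAULT_KMALLOC, FAULT_PAGE_ALLOC, FAULT_ALLOC_NID, FAULT_MAX };

struct fault_info {
	unsigned int inject_rate;	/* inject once every inject_rate calls */
	unsigned int inject_type;	/* bitmask of enabled fault types */
	unsigned int inject_ops;	/* calls seen since the last injection */
};

static struct fault_info fi = {
	.inject_rate = 3,
	.inject_type = 1 << FAULT_ALLOC_NID,
};

static bool time_to_inject_like(enum fault_type type)
{
	if (!fi.inject_rate || !(fi.inject_type & (1u << type)))
		return false;
	if (++fi.inject_ops < fi.inject_rate)
		return false;
	fi.inject_ops = 0;
	return true;
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		printf("alloc_nid attempt %d: %s\n", i,
		       time_to_inject_like(FAULT_ALLOC_NID) ? "inject failure" : "ok");
	return 0;
}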
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 17fd2b1a6848..e3decae3acfb 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
}
static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, const void *value,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
@@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
default:
return -EINVAL;
}
- return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+ return f2fs_setxattr(inode, handler->flags, name,
value, size, NULL, flags);
}
@@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
}
static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, const void *value,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
@@ -498,7 +498,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
free = free + ENTRY_SIZE(here);
if (unlikely(free < newsize)) {
- error = -ENOSPC;
+ error = -E2BIG;
goto exit;
}
}
@@ -526,7 +526,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
* Before we come here, old entry is removed.
* We just write new entry.
*/
- memset(last, 0, newsize);
last->e_name_index = index;
last->e_name_len = len;
memcpy(last->e_name, name, len);
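
The __f2fs_setxattr() hunk above switches the "attribute does not fit in the inode's xattr space" case from -ENOSPC to -E2BIG, which is what user space then sees as errno from setxattr(2). A minimal user-space sketch of handling that case; the mount path and attribute name are hypothetical, not taken from this patch:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	char big[8192];			/* deliberately larger than the per-inode xattr area */

	memset(big, 'x', sizeof(big));

	if (setxattr("/mnt/f2fs/file", "user.comment", big, sizeof(big), 0) == -1) {
		if (errno == E2BIG)
			fprintf(stderr, "attribute too large for this inode\n");
		else
			perror("setxattr");
		return 1;
	}
	return 0;
}
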
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 592cea54cea0..989a2cef6b76 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -931,7 +931,8 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
- work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ work = kzalloc(sizeof(*work),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (!work) {
trace_writeback_nowork(wb);
wb_wakeup(wb);
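
The comment in the hunk explains why this allocation is allowed to fail quietly: the work is opportunistic WB_SYNC_NONE writeback, and on failure the caller just wakes the flusher thread. A reduced sketch of that allocate-or-degrade pattern with the same GFP flags; the demo_* names are illustrative only:

#include <linux/gfp.h>
#include <linux/slab.h>

struct demo_work {
	long nr_pages;
};

/*
 * Returns a work item to queue, or NULL if the caller should fall back to a
 * plain wakeup.  GFP_NOWAIT: never sleep; __GFP_NOMEMALLOC: stay out of the
 * emergency reserves; __GFP_NOWARN: failure is expected and harmless, so do
 * not spam the log.
 */
static struct demo_work *demo_alloc_work(long nr_pages)
{
	struct demo_work *work;

	work = kzalloc(sizeof(*work),
		       GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (!work)
		return NULL;	/* caller does the cheap fallback (wakeup) */

	work->nr_pages = nr_pages;
	return work;
}
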
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 3078b679fcd1..c8c4f79c7ce1 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
put_page(results[i]);
}
+ wake_up_bit(&cookie->flags, 0);
+
_leave("");
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b9419058108f..ccd4971cc6c1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1719,10 +1719,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
return fuse_update_attributes(inode, stat, NULL, NULL);
}
-static int fuse_setxattr(struct dentry *entry, const char *name,
- const void *value, size_t size, int flags)
+static int fuse_setxattr(struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 8524c0e322fc..37b7bc14c8da 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -977,7 +977,7 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
list_del_init(&bd->bd_list);
else
- gfs2_remove_from_journal(bh, current->journal_info, 0);
+ gfs2_remove_from_journal(bh, REMOVE_JDATA);
}
bh->b_bdev = NULL;
clear_buffer_mapped(bh);
@@ -1063,7 +1063,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
rv = gfs2_glock_nq(&gh);
if (rv)
- return rv;
+ goto out_uninit;
rv = gfs2_ok_for_dio(ip, offset);
if (rv != 1)
goto out; /* dio not valid, fall back to buffered i/o */
@@ -1102,6 +1102,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
gfs2_get_block_direct, NULL, NULL, 0);
out:
gfs2_glock_dq(&gh);
+out_uninit:
gfs2_holder_uninit(&gh);
return rv;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 4a01f30e9995..271d93905bac 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
u64 *leaf_out)
{
__be64 *hash;
+ int error;
hash = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hash))
- return PTR_ERR(hash);
- *leaf_out = be64_to_cpu(*(hash + index));
- return 0;
+ error = PTR_ERR_OR_ZERO(hash);
+
+ if (!error)
+ *leaf_out = be64_to_cpu(*(hash + index));
+
+ return error;
}
static int get_first_leaf(struct gfs2_inode *dip, u32 index,
@@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
int error;
error = get_leaf_nr(dip, index, &leaf_no);
- if (!IS_ERR_VALUE(error))
+ if (!error)
error = get_leaf(dip, leaf_no, bh_out);
return error;
@@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
index = name->hash >> (32 - dip->i_depth);
error = get_leaf_nr(dip, index, &leaf_no);
- if (IS_ERR_VALUE(error))
+ if (error)
return error;
/* Get the old leaf block */
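
get_leaf_nr() is rewritten around PTR_ERR_OR_ZERO(), so its callers can test a plain integer error code instead of applying IS_ERR_VALUE() to an int. A small sketch of the idiom under the same assumption as the hunk, namely that the table pointer is either valid or an ERR_PTR() value; the demo_* name is hypothetical:

#include <linux/err.h>
#include <linux/types.h>
#include <asm/byteorder.h>

/*
 * @table is either a valid pointer or an ERR_PTR() value, exactly like the
 * return of gfs2_dir_get_hash_table() above.
 */
static int demo_read_entry(__be64 *table, unsigned int index, u64 *out)
{
	int error = PTR_ERR_OR_ZERO(table);	/* 0 for a real pointer */

	if (!error)
		*out = be64_to_cpu(table[index]);

	return error;
}
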
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e53b723abd3b..e0f98e483aec 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -160,7 +160,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
error = gfs2_glock_nq(&gh);
if (error)
- return error;
+ goto out_uninit;
fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
@@ -169,6 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
error = -EFAULT;
gfs2_glock_dq(&gh);
+out_uninit:
gfs2_holder_uninit(&gh);
return error;
}
@@ -953,6 +954,30 @@ out_uninit:
return ret;
}
+static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct inode *inode = in->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ inode_lock(inode);
+
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (ret) {
+ inode_unlock(inode);
+ return ret;
+ }
+
+ gfs2_glock_dq_uninit(&gh);
+ inode_unlock(inode);
+
+ return generic_file_splice_read(in, ppos, pipe, len, flags);
+}
+
+
static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
@@ -1115,7 +1140,7 @@ const struct file_operations gfs2_file_fops = {
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
- .splice_read = generic_file_splice_read,
+ .splice_read = gfs2_file_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
@@ -1143,7 +1168,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = gfs2_file_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4b73bd101bdc..706fd9352f36 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -218,7 +218,7 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
*
*/
-static inline void do_error(struct gfs2_glock *gl, const int ret)
+static void do_error(struct gfs2_glock *gl, const int ret)
{
struct gfs2_holder *gh, *tmp;
@@ -475,7 +475,14 @@ __acquires(&gl->gl_lockref.lock)
if (sdp->sd_lockstruct.ls_ops->lm_lock) {
/* lock_dlm */
ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
- if (ret) {
+ if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED &&
+ target == LM_ST_UNLOCKED &&
+ test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) {
+ finish_xmote(gl, target);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+ }
+ else if (ret) {
pr_err("lm_lock ret %d\n", ret);
GLOCK_BUG_ON(gl, 1);
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 437fd73e381e..5db59d444838 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -286,17 +286,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
static int inode_go_demote_ok(const struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct gfs2_holder *gh;
if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
return 0;
- if (!list_empty(&gl->gl_holders)) {
- gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
- if (gh->gh_list.next != &gl->gl_holders)
- return 0;
- }
-
return 1;
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 72e9c64ae371..21dc784f66c2 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -93,12 +93,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
int error;
inode = iget_locked(sb, (unsigned long)no_addr);
- ip = GFS2_I(inode);
- ip->i_no_addr = no_addr;
-
if (!inode)
return ERR_PTR(-ENOMEM);
+ ip = GFS2_I(inode);
+ ip->i_no_addr = no_addr;
+
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
ip->i_no_formal_ino = no_formal_ino;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0448524c11bc..8eaadabbc771 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -325,18 +325,19 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
return 0;
}
-void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
+void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
{
struct address_space *mapping = bh->b_page->mapping;
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
struct gfs2_bufdata *bd = bh->b_private;
+ struct gfs2_trans *tr = current->journal_info;
int was_pinned = 0;
if (test_clear_buffer_pinned(bh)) {
trace_gfs2_pin(bd, 0);
atomic_dec(&sdp->sd_log_pinned);
list_del_init(&bd->bd_list);
- if (meta)
+ if (meta == REMOVE_META)
tr->tr_num_buf_rm++;
else
tr->tr_num_databuf_rm++;
@@ -376,7 +377,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
- gfs2_remove_from_journal(bh, current->journal_info, 1);
+ gfs2_remove_from_journal(bh, REMOVE_META);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index c5086c8af5ed..ffdf6aa3509d 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -57,8 +57,12 @@ extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
int create);
-extern void gfs2_remove_from_journal(struct buffer_head *bh,
- struct gfs2_trans *tr, int meta);
+enum {
+ REMOVE_JDATA = 0,
+ REMOVE_META = 1,
+};
+
+extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head **bhp);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 99a0bdac8796..5bd216901e89 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -73,8 +73,7 @@ static const char valid_change[16] = {
};
static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
- const struct gfs2_inode *ip, bool nowrap,
- const struct gfs2_alloc_parms *ap);
+ const struct gfs2_inode *ip, bool nowrap);
/**
@@ -1511,7 +1510,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
return;
- ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
+ ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
if (ret == 0) {
rs->rs_rbm = rbm;
rs->rs_free = extlen;
@@ -1638,7 +1637,6 @@ fail:
* @ip: If set, check for reservations
* @nowrap: Stop looking at the end of the rgrp, rather than wrapping
* around until we've reached the starting point.
- * @ap: the allocation parameters
*
* Side effects:
* - If looking for free blocks, we set GBF_FULL on each bitmap which
@@ -1650,8 +1648,7 @@ fail:
*/
static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
- const struct gfs2_inode *ip, bool nowrap,
- const struct gfs2_alloc_parms *ap)
+ const struct gfs2_inode *ip, bool nowrap)
{
struct buffer_head *bh;
int initial_bii;
@@ -1772,7 +1769,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
while (1) {
down_write(&sdp->sd_log_flush_lock);
error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
- true, NULL);
+ true);
up_write(&sdp->sd_log_flush_lock);
if (error == -ENOSPC)
break;
@@ -2329,12 +2326,11 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
int error;
gfs2_set_alloc_start(&rbm, ip, dinode);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
if (error == -ENOSPC) {
gfs2_set_alloc_start(&rbm, ip, dinode);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
- NULL);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
}
/* Since all blocks are reserved in advance, this shouldn't happen */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index cf645835710f..aee4485ad8a9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -68,6 +68,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
fs_err(sdp, "telling LM to unmount\n");
lm->lm_unmount(sdp);
}
+ set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
fs_err(sdp, "withdrawn\n");
dump_stack();
}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index f42ab53bd30d..3a2853504084 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
}
static int gfs2_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 064f92f17efc..d9a86919fdf6 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -13,10 +13,10 @@
#include "hfs_fs.h"
#include "btree.h"
-int hfs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int hfs_setxattr(struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct hfs_cat_file *file;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fa3eed86837c..ee2f385811c8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *);
extern void hfs_delete_inode(struct inode *);
/* attr.c */
-extern int hfs_setxattr(struct dentry *dentry, const char *name,
+extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags);
extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size);
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4f118d282a7a..d37bb88dc746 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
return len;
}
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags,
const char *prefix, size_t prefixlen)
{
@@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
return -ENOMEM;
strcpy(xattr_name, prefix);
strcpy(xattr_name + prefixlen, name);
- res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
- flags);
+ res = __hfsplus_setxattr(inode, xattr_name, value, size, flags);
kfree(xattr_name);
return res;
}
@@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
}
static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
/*
* Don't allow setting properly prefixed attributes
@@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
* creates), so we pass the name through unmodified (after
* ensuring it doesn't conflict with another namespace).
*/
- return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
+ return __hfsplus_setxattr(inode, name, buffer, size, flags);
}
const struct xattr_handler hfsplus_xattr_osx_handler = {
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d04ba6f58df2..68f6b539371f 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags,
const char *prefix, size_t prefixlen);
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index ae2ca8c2e335..37b3efa733ef 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler,
}
static int hfsplus_security_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ return hfsplus_setxattr(inode, name, buffer, size, flags,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index eae2947060aa..94519d6c627d 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
}
static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ return hfsplus_setxattr(inode, name, buffer, size, flags,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 3c9eec3e4c7b..fae6c0ea0030 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler,
}
static int hfsplus_user_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ return hfsplus_setxattr(inode, name, buffer, size, flags,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 458cf463047b..82067ca22f2b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
@@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
int lowercase, eas, chk, errs, chkdsk, timeshift;
int o;
struct hpfs_sb_info *sbi = hpfs_sb(s);
- char *new_opts = kstrdup(data, GFP_KERNEL);
-
- if (!new_opts)
- return -ENOMEM;
sync_filesystem(s);
@@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
- replace_mount_options(s, new_opts);
-
hpfs_unlock(s);
return 0;
out_err:
hpfs_unlock(s);
- kfree(new_opts);
return -EINVAL;
}
+static int hpfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb);
+
+ seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid));
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid));
+ seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777));
+ if (sbi->sb_lowercase)
+ seq_printf(seq, ",case=lower");
+ if (!sbi->sb_chk)
+ seq_printf(seq, ",check=none");
+ if (sbi->sb_chk == 2)
+ seq_printf(seq, ",check=strict");
+ if (!sbi->sb_err)
+ seq_printf(seq, ",errors=continue");
+ if (sbi->sb_err == 2)
+ seq_printf(seq, ",errors=panic");
+ if (!sbi->sb_chkdsk)
+ seq_printf(seq, ",chkdsk=no");
+ if (sbi->sb_chkdsk == 2)
+ seq_printf(seq, ",chkdsk=always");
+ if (!sbi->sb_eas)
+ seq_printf(seq, ",eas=no");
+ if (sbi->sb_eas == 1)
+ seq_printf(seq, ",eas=ro");
+ if (sbi->sb_timeshift)
+ seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift);
+ return 0;
+}
+
/* Super operations */
static const struct super_operations hpfs_sops =
@@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops =
.put_super = hpfs_put_super,
.statfs = hpfs_statfs,
.remount_fs = hpfs_remount_fs,
- .show_options = generic_show_options,
+ .show_options = hpfs_show_options,
};
static int hpfs_fill_super(struct super_block *s, void *options, int silent)
@@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
int o;
- save_mount_options(s, options);
-
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi) {
return -ENOMEM;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2ad98d6e19f4..70078096117d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ if (!(jinode->i_flags & JI_WRITE_DATA))
+ continue;
mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
@@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ if (!(jinode->i_flags & JI_WAIT_DATA))
+ continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 435f0b26ac20..b31852f76f46 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_inode_add_write);
+EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2c56c3e32194..1749519b362f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
/*
* File inode in the inode list of the handle's transaction
*/
-int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
+ unsigned long flags)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
* and if jinode->i_next_transaction == transaction, commit code
* will only file the inode where we want it.
*/
- if (jinode->i_transaction == transaction ||
- jinode->i_next_transaction == transaction)
+ if ((jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction) &&
+ (jinode->i_flags & flags) == flags)
return 0;
spin_lock(&journal->j_list_lock);
-
+ jinode->i_flags |= flags;
+ /* Is inode already attached where we need it? */
if (jinode->i_transaction == transaction ||
jinode->i_next_transaction == transaction)
goto done;
@@ -2523,6 +2526,17 @@ done:
return 0;
}
+int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+{
+ return jbd2_journal_file_inode(handle, jinode,
+ JI_WRITE_DATA | JI_WAIT_DATA);
+}
+
+int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+{
+ return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+}
+
/*
* File truncate and transaction commit interact with each other in a
* non-trivial way. If a transaction writing data block A is
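
jbd2_journal_file_inode() becomes static and gains a flags argument, and filesystems are expected to pick one of the two new wrappers: add_write when the committing transaction must submit and wait on the inode's data, add_wait when writeback was already submitted elsewhere and the commit only needs to wait for it. A hedged sketch of a caller using the split API, as exported by this series; the surrounding filesystem code and demo_* names are hypothetical:

#include <linux/jbd2.h>

/*
 * Ordered-data style: the transaction dirtied page cache for @jinode, so
 * commit must both write and wait on that data before the commit record.
 */
static int demo_order_data(handle_t *handle, struct jbd2_inode *jinode)
{
	return jbd2_journal_inode_add_write(handle, jinode);
}

/*
 * The data was already submitted by the caller (for example a direct write
 * path); commit only needs to wait for it to reach stable storage.
 */
static int demo_wait_data(handle_t *handle, struct jbd2_inode *jinode)
{
	return jbd2_journal_inode_add_wait(handle, jinode);
}
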
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 3ed9a4b49778..c2332e30f218 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
}
static int jffs2_security_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+ return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 4ebecff1d922..5d6030826c52 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
}
static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+ return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index bce249e1b277..9d027b4abcf9 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
}
static int jffs2_user_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+ return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index beb182b503b3..0bf3c33aedff 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler,
}
static int jfs_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
name = xattr_full_name(handler, name);
return __jfs_xattr_set(inode, name, value, size, flags);
}
@@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler,
}
static int jfs_xattr_set_os2(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
if (is_known_namespace(name))
return -EOPNOTSUPP;
return __jfs_xattr_set(inode, name, value, size, flags);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 68a44315d949..8a652404eef6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -753,7 +753,8 @@ int kernfs_add_one(struct kernfs_node *kn)
ps_iattr = parent->iattr;
if (ps_iattr) {
struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
- ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&ps_iattrs->ia_ctime);
+ ps_iattrs->ia_mtime = ps_iattrs->ia_ctime;
}
mutex_unlock(&kernfs_mutex);
@@ -1279,8 +1280,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
/* update timestamps on the parent */
if (ps_iattr) {
- ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
- ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime);
+ ps_iattr->ia_iattr.ia_mtime =
+ ps_iattr->ia_iattr.ia_ctime;
}
kernfs_put(pos);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 7247252ee9b1..e1574008adc9 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -190,15 +190,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
char *buf;
buf = of->prealloc_buf;
- if (!buf)
+ if (buf)
+ mutex_lock(&of->prealloc_mutex);
+ else
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
/*
* @of->mutex nests outside active ref and is used both to ensure that
- * the ops aren't called concurrently for the same open file, and
- * to provide exclusive access to ->prealloc_buf (when that exists).
+ * the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
@@ -214,21 +215,23 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
else
len = -EINVAL;
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+
if (len < 0)
- goto out_unlock;
+ goto out_free;
if (copy_to_user(user_buf, buf, len)) {
len = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
*ppos += len;
- out_unlock:
- kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
out_free:
- if (buf != of->prealloc_buf)
+ if (buf == of->prealloc_buf)
+ mutex_unlock(&of->prealloc_mutex);
+ else
kfree(buf);
return len;
}
@@ -284,15 +287,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
}
buf = of->prealloc_buf;
- if (!buf)
+ if (buf)
+ mutex_lock(&of->prealloc_mutex);
+ else
buf = kmalloc(len + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
+ if (copy_from_user(buf, user_buf, len)) {
+ len = -EFAULT;
+ goto out_free;
+ }
+ buf[len] = '\0'; /* guarantee string termination */
+
/*
* @of->mutex nests outside active ref and is used both to ensure that
- * the ops aren't called concurrently for the same open file, and
- * to provide exclusive access to ->prealloc_buf (when that exists).
+ * the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
@@ -301,26 +311,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
goto out_free;
}
- if (copy_from_user(buf, user_buf, len)) {
- len = -EFAULT;
- goto out_unlock;
- }
- buf[len] = '\0'; /* guarantee string termination */
-
ops = kernfs_ops(of->kn);
if (ops->write)
len = ops->write(of, buf, len, *ppos);
else
len = -EINVAL;
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+
if (len > 0)
*ppos += len;
-out_unlock:
- kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
out_free:
- if (buf != of->prealloc_buf)
+ if (buf == of->prealloc_buf)
+ mutex_unlock(&of->prealloc_mutex);
+ else
kfree(buf);
return len;
}
@@ -687,6 +693,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
error = -ENOMEM;
if (!of->prealloc_buf)
goto err_free;
+ mutex_init(&of->prealloc_mutex);
}
/*
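
The read and write paths above now take of->prealloc_mutex only when the preallocated buffer is actually used, and release it (or kfree a fallback buffer) symmetrically after the copy. The buffer-selection pattern, reduced to a standalone sketch with illustrative demo_* names:

#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_file {
	char *prealloc_buf;		/* optional, sized at open time */
	struct mutex prealloc_mutex;	/* serializes users of prealloc_buf */
};

static char *demo_get_buf(struct demo_file *of, size_t len)
{
	char *buf = of->prealloc_buf;

	if (buf)
		mutex_lock(&of->prealloc_mutex);	/* exclusive use of the shared buffer */
	else
		buf = kmalloc(len, GFP_KERNEL);

	return buf;				/* may be NULL on allocation failure */
}

static void demo_put_buf(struct demo_file *of, char *buf)
{
	if (buf == of->prealloc_buf)
		mutex_unlock(&of->prealloc_mutex);
	else
		kfree(buf);
}
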
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b5247226732b..63b925d5ba1e 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -54,7 +54,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
iattrs->ia_mode = kn->mode;
iattrs->ia_uid = GLOBAL_ROOT_UID;
iattrs->ia_gid = GLOBAL_ROOT_GID;
- iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+ ktime_get_real_ts(&iattrs->ia_atime);
+ iattrs->ia_mtime = iattrs->ia_atime;
+ iattrs->ia_ctime = iattrs->ia_atime;
simple_xattrs_init(&kn->iattr->xattrs);
out_unlock:
@@ -157,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
return 0;
}
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_node *kn = inode->i_private;
struct kernfs_iattrs *attrs;
void *secdata;
int error;
@@ -172,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
- error = security_inode_setsecurity(d_inode(dentry), suffix,
+ error = security_inode_setsecurity(inode, suffix,
value, size, flags);
if (error)
return error;
- error = security_inode_getsecctx(d_inode(dentry),
+ error = security_inode_getsecctx(inode,
&secdata, &secdata_len);
if (error)
return error;
@@ -236,16 +240,18 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime =
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
{
+ struct super_block *sb = inode->i_sb;
inode->i_uid = iattr->ia_uid;
inode->i_gid = iattr->ia_gid;
- inode->i_atime = iattr->ia_atime;
- inode->i_mtime = iattr->ia_mtime;
- inode->i_ctime = iattr->ia_ctime;
+ inode->i_atime = timespec_trunc(iattr->ia_atime, sb->s_time_gran);
+ inode->i_mtime = timespec_trunc(iattr->ia_mtime, sb->s_time_gran);
+ inode->i_ctime = timespec_trunc(iattr->ia_ctime, sb->s_time_gran);
}
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 45c9192c276e..37159235ac10 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask);
int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags);
int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode,
diff --git a/fs/libfs.c b/fs/libfs.c
index 8765ff1adc07..3db2721144c2 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1118,8 +1118,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
return -EPERM;
}
-static int empty_dir_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
return -EOPNOTSUPP;
}
diff --git a/fs/namei.c b/fs/namei.c
index 9d193d336c9f..70580ab1445c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -35,6 +35,7 @@
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
+#include <linux/bitops.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -1415,21 +1416,28 @@ static void follow_mount(struct path *path)
}
}
+static int path_parent_directory(struct path *path)
+{
+ struct dentry *old = path->dentry;
+ /* rare case of legitimate dget_parent()... */
+ path->dentry = dget_parent(path->dentry);
+ dput(old);
+ if (unlikely(!path_connected(path)))
+ return -ENOENT;
+ return 0;
+}
+
static int follow_dotdot(struct nameidata *nd)
{
while(1) {
- struct dentry *old = nd->path.dentry;
-
if (nd->path.dentry == nd->root.dentry &&
nd->path.mnt == nd->root.mnt) {
break;
}
if (nd->path.dentry != nd->path.mnt->mnt_root) {
- /* rare case of legitimate dget_parent()... */
- nd->path.dentry = dget_parent(nd->path.dentry);
- dput(old);
- if (unlikely(!path_connected(&nd->path)))
- return -ENOENT;
+ int ret = path_parent_directory(&nd->path);
+ if (ret)
+ return ret;
break;
}
if (!follow_up(&nd->path))
@@ -1797,74 +1805,144 @@ static int walk_component(struct nameidata *nd, int flags)
#include <asm/word-at-a-time.h>
-#ifdef CONFIG_64BIT
+#ifdef HASH_MIX
-static inline unsigned int fold_hash(unsigned long hash)
-{
- return hash_64(hash, 32);
-}
+/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
+
+#elif defined(CONFIG_64BIT)
+/*
+ * Register pressure in the mixing function is an issue, particularly
+ * on 32-bit x86, but almost any function requires one state value and
+ * one temporary. Instead, use a function designed for two state values
+ * and no temporaries.
+ *
+ * This function cannot create a collision in only two iterations, so
+ * we have two iterations to achieve avalanche. In those two iterations,
+ * we have six layers of mixing, which is enough to spread one bit's
+ * influence out to 2^6 = 64 state bits.
+ *
+ * Rotate constants are scored by considering either 64 one-bit input
+ * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
+ * probability of that delta causing a change to each of the 128 output
+ * bits, using a sample of random initial states.
+ *
+ * The Shannon entropy of the computed probabilities is then summed
+ * to produce a score. Ideally, any input change has a 50% chance of
+ * toggling any given output bit.
+ *
+ * Mixing scores (in bits) for (12,45):
+ * Input delta: 1-bit 2-bit
+ * 1 round: 713.3 42542.6
+ * 2 rounds: 2753.7 140389.8
+ * 3 rounds: 5954.1 233458.2
+ * 4 rounds: 7862.6 256672.2
+ * Perfect: 8192 258048
+ * (64*128) (64*63/2 * 128)
+ */
+#define HASH_MIX(x, y, a) \
+ ( x ^= (a), \
+ y ^= x, x = rol64(x,12),\
+ x += y, y = rol64(y,45),\
+ y *= 9 )
/*
- * This is George Marsaglia's XORSHIFT generator.
- * It implements a maximum-period LFSR in only a few
- * instructions. It also has the property (required
- * by hash_name()) that mix_hash(0) = 0.
+ * Fold two longs into one 32-bit hash value. This must be fast, but
+ * latency isn't quite as critical, as there is a fair bit of additional
+ * work done before the hash value is used.
*/
-static inline unsigned long mix_hash(unsigned long hash)
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
- hash ^= hash << 13;
- hash ^= hash >> 7;
- hash ^= hash << 17;
- return hash;
+ y ^= x * GOLDEN_RATIO_64;
+ y *= GOLDEN_RATIO_64;
+ return y >> 32;
}
#else /* 32-bit case */
-#define fold_hash(x) (x)
+/*
+ * Mixing scores (in bits) for (7,20):
+ * Input delta: 1-bit 2-bit
+ * 1 round: 330.3 9201.6
+ * 2 rounds: 1246.4 25475.4
+ * 3 rounds: 1907.1 31295.1
+ * 4 rounds: 2042.3 31718.6
+ * Perfect: 2048 31744
+ * (32*64) (32*31/2 * 64)
+ */
+#define HASH_MIX(x, y, a) \
+ ( x ^= (a), \
+ y ^= x, x = rol32(x, 7),\
+ x += y, y = rol32(y,20),\
+ y *= 9 )
-static inline unsigned long mix_hash(unsigned long hash)
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
- hash ^= hash << 13;
- hash ^= hash >> 17;
- hash ^= hash << 5;
- return hash;
+ /* Use arch-optimized multiply if one exists */
+ return __hash_32(y ^ __hash_32(x));
}
#endif
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/*
+ * Return the hash of a string of known length. This is carefully
+ * designed to match hash_name(), which is the more critical function.
+ * In particular, we must end by hashing a final word containing 0..7
+ * payload bytes, to match the way that hash_name() iterates until it
+ * finds the delimiter after the name.
+ */
+unsigned int full_name_hash(const char *name, unsigned int len)
{
- unsigned long a, hash = 0;
+ unsigned long a, x = 0, y = 0;
for (;;) {
+ if (!len)
+ goto done;
a = load_unaligned_zeropad(name);
if (len < sizeof(unsigned long))
break;
- hash = mix_hash(hash + a);
+ HASH_MIX(x, y, a);
name += sizeof(unsigned long);
len -= sizeof(unsigned long);
- if (!len)
- goto done;
}
- hash += a & bytemask_from_count(len);
+ x ^= a & bytemask_from_count(len);
done:
- return fold_hash(hash);
+ return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const char *name)
+{
+ unsigned long a = 0, x = 0, y = 0, adata, mask, len;
+ const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+
+ len = -sizeof(unsigned long);
+ do {
+ HASH_MIX(x, y, a);
+ len += sizeof(unsigned long);
+ a = load_unaligned_zeropad(name+len);
+ } while (!has_zero(a, &adata, &constants));
+
+ adata = prep_zero_mask(a, adata, &constants);
+ mask = create_zero_mask(adata);
+ x ^= a & zero_bytemask(mask);
+
+ return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+}
+EXPORT_SYMBOL(hashlen_string);
+
/*
* Calculate the length and hash of the path component, and
* return the "hash_len" as the result.
*/
static inline u64 hash_name(const char *name)
{
- unsigned long a, b, adata, bdata, mask, hash, len;
+ unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- hash = a = 0;
len = -sizeof(unsigned long);
do {
- hash = mix_hash(hash + a);
+ HASH_MIX(x, y, a);
len += sizeof(unsigned long);
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
@@ -1872,25 +1950,40 @@ static inline u64 hash_name(const char *name)
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
-
mask = create_zero_mask(adata | bdata);
+ x ^= a & zero_bytemask(mask);
- hash += a & zero_bytemask(mask);
- len += find_zero(mask);
- return hashlen_create(fold_hash(hash), len);
+ return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
-#else
+#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const char *name, unsigned int len)
{
unsigned long hash = init_name_hash();
while (len--)
- hash = partial_name_hash(*name++, hash);
+ hash = partial_name_hash((unsigned char)*name++, hash);
return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const char *name)
+{
+ unsigned long hash = init_name_hash();
+ unsigned long len = 0, c;
+
+ c = (unsigned char)*name;
+ while (c) {
+ len++;
+ hash = partial_name_hash(c, hash);
+ c = (unsigned char)name[len];
+ }
+ return hashlen_create(end_name_hash(hash), len);
+}
+EXPORT_SYMBOL(hashlen_string);
+
/*
* We know there's a real path component here of at least
* one character.
@@ -1934,7 +2027,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
int type;
err = may_lookup(nd);
- if (err)
+ if (err)
return err;
hash_len = hash_name(name);
@@ -2428,6 +2521,34 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+ /* Find something mounted on "pts" in the same directory as
+ * the input path.
+ */
+ struct dentry *child, *parent;
+ struct qstr this;
+ int ret;
+
+ ret = path_parent_directory(path);
+ if (ret)
+ return ret;
+
+ parent = path->dentry;
+ this.name = "pts";
+ this.len = 3;
+ child = d_hash_and_lookup(parent, &this);
+ if (!child)
+ return -ENOENT;
+
+ path->dentry = child;
+ dput(parent);
+ follow_mount(path);
+ return 0;
+}
+#endif
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
@@ -2909,9 +3030,13 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
}
if (*opened & FILE_CREATED)
fsnotify_create(dir, dentry);
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- return 1;
+ if (unlikely(d_is_negative(dentry))) {
+ error = -ENOENT;
+ } else {
+ path->dentry = dentry;
+ path->mnt = nd->path.mnt;
+ return 1;
+ }
}
}
dput(dentry);
@@ -3080,9 +3205,7 @@ static int do_last(struct nameidata *nd,
int acc_mode = op->acc_mode;
unsigned seq;
struct inode *inode;
- struct path save_parent = { .dentry = NULL, .mnt = NULL };
struct path path;
- bool retried = false;
int error;
nd->flags &= ~LOOKUP_PARENT;
@@ -3125,7 +3248,6 @@ static int do_last(struct nameidata *nd,
return -EISDIR;
}
-retry_lookup:
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
@@ -3177,6 +3299,10 @@ retry_lookup:
got_write = false;
}
+ error = follow_managed(&path, nd);
+ if (unlikely(error < 0))
+ return error;
+
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
@@ -3192,10 +3318,6 @@ retry_lookup:
return -EEXIST;
}
- error = follow_managed(&path, nd);
- if (unlikely(error < 0))
- return error;
-
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
@@ -3206,23 +3328,14 @@ finish_lookup:
if (unlikely(error))
return error;
- if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
- path_to_nameidata(&path, nd);
- } else {
- save_parent.dentry = nd->path.dentry;
- save_parent.mnt = mntget(path.mnt);
- nd->path.dentry = path.dentry;
-
- }
+ path_to_nameidata(&path, nd);
nd->inode = inode;
nd->seq = seq;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
error = complete_walk(nd);
- if (error) {
- path_put(&save_parent);
+ if (error)
return error;
- }
audit_inode(nd->name, nd->path.dentry, 0);
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
@@ -3245,13 +3358,9 @@ finish_open_created:
goto out;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
- if (!error) {
- *opened |= FILE_OPENED;
- } else {
- if (error == -EOPENSTALE)
- goto stale_open;
+ if (error)
goto out;
- }
+ *opened |= FILE_OPENED;
opened:
error = open_check_o_direct(file);
if (!error)
@@ -3267,26 +3376,7 @@ out:
}
if (got_write)
mnt_drop_write(nd->path.mnt);
- path_put(&save_parent);
return error;
-
-stale_open:
- /* If no saved parent or already retried then can't retry */
- if (!save_parent.dentry || retried)
- goto out;
-
- BUG_ON(save_parent.dentry != dir);
- path_put(&nd->path);
- nd->path = save_parent;
- nd->inode = dir->d_inode;
- save_parent.mnt = NULL;
- save_parent.dentry = NULL;
- if (got_write) {
- mnt_drop_write(nd->path.mnt);
- got_write = false;
- }
- retried = true;
- goto retry_lookup;
}
static int do_tmpfile(struct nameidata *nd, unsigned flags,
@@ -3627,6 +3717,8 @@ retry:
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ if (!error)
+ ima_post_path_mknod(dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,
@@ -4540,7 +4632,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
out:
return len;
}
-EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
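
The new name hashing above replaces the single-state XORSHIFT mix with the two-state HASH_MIX()/fold_hash() pair described in the comments of the hunk. Below is a user-space sketch of the 64-bit variant over a length-known buffer: it deliberately skips load_unaligned_zeropad() and the zero-byte masking (whole words via memcpy, then a zero-padded tail), and the GOLDEN_RATIO_64 constant is assumed to match include/linux/hash.h of this era, so treat it as illustrative rather than bit-exact.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GOLDEN_RATIO_64 0x61C8864680B583EBull	/* assumed value, see include/linux/hash.h */

static inline uint64_t rol64(uint64_t w, unsigned s)
{
	return (w << s) | (w >> (64 - s));
}

/* Same structure as the kernel's HASH_MIX() for CONFIG_64BIT: rotates 12, 45. */
#define HASH_MIX(x, y, a)		\
	( x ^= (a),			\
	  y ^= x, x = rol64(x, 12),	\
	  x += y, y = rol64(y, 45),	\
	  y *= 9 )

static inline unsigned int fold_hash(uint64_t x, uint64_t y)
{
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
}

/* Simplified full_name_hash(): mix whole words, then fold in the 0..7 tail bytes. */
static unsigned int name_hash(const char *name, size_t len)
{
	uint64_t a, x = 0, y = 0;

	while (len >= sizeof(a)) {
		memcpy(&a, name, sizeof(a));
		HASH_MIX(x, y, a);
		name += sizeof(a);
		len -= sizeof(a);
	}
	a = 0;
	memcpy(&a, name, len);		/* remaining payload bytes, zero padded */
	x ^= a;
	return fold_hash(x, y);
}

int main(void)
{
	printf("%08x\n", name_hash("vmlinux", 7));
	return 0;
}
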
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691b4355..a7ec92c051f5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2409,8 +2409,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
}
if (type->fs_flags & FS_USERNS_VISIBLE) {
- if (!fs_fully_visible(type, &mnt_flags))
+ if (!fs_fully_visible(type, &mnt_flags)) {
+ put_filesystem(type);
return -EPERM;
+ }
}
}
@@ -3271,7 +3273,7 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
struct inode *inode = child->mnt_mountpoint->d_inode;
/* Only worry about locked mounts */
- if (!(mnt_flags & MNT_LOCKED))
+ if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
/* Is the directory permanently empty? */
if (!is_empty_dir_inode(inode))
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 618ced381a14..aaa2e8d3df6f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp,
}
if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
- &args->cbl_range)) {
+ &args->cbl_range,
+ be32_to_cpu(args->cbl_stateid.seqid))) {
rv = NFS4_OK;
goto unlock;
}
@@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
cps->slot = slot;
/* The ca_maxresponsesize_cached is 0 with no DRC */
- if (args->csa_cachethis != 0)
- return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ if (args->csa_cachethis != 0) {
+ status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ goto out_unlock;
+ }
/*
* Check for pending referring calls. If a match is found, a
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 976c90608e56..d81f96aacd51 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
p = read_buf(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(stateid, p, NFS4_STATEID_SIZE);
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
return 0;
}
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
{
__be32 *p;
@@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
__be32 *p;
__be32 status;
- status = decode_stateid(xdr, &args->stateid);
+ status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
goto out;
p = read_buf(xdr, 4);
@@ -227,6 +233,11 @@ out:
}
#if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
@@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
}
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
- status = decode_stateid(xdr, &args->cbl_stateid);
+ status = decode_layout_stateid(xdr, &args->cbl_stateid);
if (unlikely(status != 0))
goto out;
} else if (args->cbl_recall_type == RETURN_FSID) {
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5166adcfc0fb..322c2585bc34 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp)
/**
* nfs4_copy_delegation_stateid - Copy inode's state ID information
- * @dst: stateid data structure to fill in
* @inode: inode to check
* @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential
*
* Returns "true" and fills in "dst->data" if inode had a delegation,
* otherwise "false" is returned.
*/
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
- fmode_t flags)
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+ nfs4_stateid *dst, struct rpc_cred **cred)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
@@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
if (ret) {
nfs4_stateid_copy(dst, &delegation->stateid);
nfs_mark_delegation_referenced(delegation);
+ if (cred)
+ *cred = get_rpccred(delegation->cred);
}
rcu_read_unlock();
return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 333063e032f0..64724d252a79 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 741a92c470bb..979b3c4dee6a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -87,6 +87,7 @@ struct nfs_direct_req {
int mirror_count;
ssize_t count, /* bytes actually processed */
+ max_count, /* max expected count */
bytes_left, /* bytes left to be sent */
io_start, /* start of IO */
error; /* any reported error */
@@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
int i;
ssize_t count;
+ WARN_ON_ONCE(dreq->count >= dreq->max_count);
+
if (dreq->mirror_count == 1) {
dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
dreq->count += hdr->good_bytes;
@@ -275,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq)
{
- cinfo->lock = &dreq->inode->i_lock;
+ cinfo->inode = dreq->inode;
cinfo->mds = &dreq->mds_cinfo;
cinfo->ds = &dreq->ds_cinfo;
cinfo->dreq = dreq;
@@ -591,7 +594,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
goto out_unlock;
dreq->inode = inode;
- dreq->bytes_left = count;
+ dreq->bytes_left = dreq->max_count = count;
dreq->io_start = iocb->ki_pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -630,13 +633,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
struct list_head *list,
struct nfs_commit_info *cinfo)
{
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
#ifdef CONFIG_NFS_V4_1
if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
#endif
nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
}
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
@@ -671,13 +674,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
if (!nfs_pageio_add_request(&desc, req)) {
nfs_list_remove_request(req);
nfs_list_add_request(req, &failed);
- spin_lock(cinfo.lock);
+ spin_lock(&cinfo.inode->i_lock);
dreq->flags = 0;
if (desc.pg_error < 0)
dreq->error = desc.pg_error;
else
dreq->error = -EIO;
- spin_unlock(cinfo.lock);
+ spin_unlock(&cinfo.inode->i_lock);
}
nfs_release_request(req);
}
@@ -1023,7 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
goto out_unlock;
dreq->inode = inode;
- dreq->bytes_left = iov_iter_count(iter);
+ dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3384dc8e6683..aa59757389dc 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
}
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
if (cinfo->ds->nbuckets >= size)
goto out;
for (i = 0; i < cinfo->ds->nbuckets; i++) {
@@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
swap(cinfo->ds->buckets, buckets);
cinfo->ds->nbuckets = size;
out:
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
kfree(buckets);
return 0;
}
@@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
+ false,
GFP_KERNEL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0cb1abd535e3..0e8018bc9880 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -26,6 +26,8 @@
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+static struct group_info *ff_zero_group;
+
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
@@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
kfree(FF_LAYOUT_FROM_HDR(lo));
}
-static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
__be32 *p;
p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return -ENOBUFS;
- memcpy(stateid, p, NFS4_STATEID_SIZE);
+ stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
p[0], p[1], p[2], p[3]);
return 0;
@@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
+ struct rpc_cred *cred;
+
ff_layout_remove_mirror(mirror);
kfree(mirror->fh_versions);
- if (mirror->cred)
- put_rpccred(mirror->cred);
+ cred = rcu_access_pointer(mirror->ro_cred);
+ if (cred)
+ put_rpccred(cred);
+ cred = rcu_access_pointer(mirror->rw_cred);
+ if (cred)
+ put_rpccred(cred);
nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
kfree(mirror);
}
@@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
{
u64 new_end, old_end;
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ return false;
if (new->pls_range.iomode != old->pls_range.iomode)
return false;
old_end = pnfs_calc_offset_end(old->pls_range.offset,
@@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
new_end);
if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
set_bit(NFS_LSEG_ROC, &new->pls_flags);
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
- set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
return true;
}
@@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
struct nfs4_deviceid_node *idnode;
- u32 ds_count;
- u32 fh_count;
+ struct auth_cred acred = { .group_info = ff_zero_group };
+ struct rpc_cred __rcu *cred;
+ u32 ds_count, fh_count, id;
int j;
rc = -EIO;
@@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->efficiency = be32_to_cpup(p);
/* stateid */
- rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+ rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
if (rc)
goto out_err_free;
@@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->fh_versions_cnt = fh_count;
/* user */
- rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
+ acred.uid = make_kuid(&init_user_ns, id);
+
/* group */
- rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
+ acred.gid = make_kgid(&init_user_ns, id);
+
+ /* find the cred for it */
+ rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
+ if (IS_ERR(cred)) {
+ rc = PTR_ERR(cred);
+ goto out_err_free;
+ }
+
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ else
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
if (mirror != fls->mirror_array[i]) {
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->ro_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->rw_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ }
ff_layout_free_mirror(fls->mirror_array[i]);
fls->mirror_array[i] = mirror;
}
- dprintk("%s: uid %d gid %d\n", __func__,
- fls->mirror_array[i]->uid,
- fls->mirror_array[i]->gid);
+ dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+ lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+ from_kuid(&init_user_ns, acred.uid),
+ from_kgid(&init_user_ns, acred.gid));
}
p = xdr_inline_decode(&stream, 4);
@@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
else {
int i;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
if (cinfo->ds->nbuckets != 0)
kfree(buckets);
else {
@@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
NFS_INVALID_STABLE_HOW;
}
}
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
return 0;
}
}
@@ -786,6 +821,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
}
static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req,
+ bool strict_iomode)
+{
+retry_strict:
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_READ,
+ strict_iomode,
+ GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ }
+
+ /* If we are not in strict_iomode, but got back an IOMODE_RW
+ * segment and the server wants READs avoided on it,
+ * then retry with strict_iomode set.
+ */
+ if (pgio->pg_lseg && !strict_iomode &&
+ ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+ strict_iomode = true;
+ goto retry_strict;
+ }
+}
+
+static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- req->wb_context,
- 0,
- NFS4_MAX_UINT64,
- IOMODE_READ,
- GFP_KERNEL);
- if (IS_ERR(pgio->pg_lseg)) {
- pgio->pg_error = PTR_ERR(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- return;
- }
- }
+ if (!pgio->pg_lseg)
+ ff_layout_pg_get_read(pgio, req, false);
+ else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg))
+ ff_layout_pg_get_read(pgio, req, true);
+
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
- if (!ds)
- goto out_mds;
+ if (!ds) {
+ if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_pnfs;
+ else
+ goto out_mds;
+ }
+
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
pgio->pg_mirror_idx = ds_idx;
@@ -828,6 +890,12 @@ out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
nfs_pageio_reset_read_mds(pgio);
+ return;
+
+out_pnfs:
+ pnfs_set_lo_fail(pgio->pg_lseg);
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
}
static void
@@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
for (i = 0; i < pgio->pg_mirror_count; i++) {
ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
- if (!ds)
- goto out_mds;
+ if (!ds) {
+ if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_pnfs;
+ else
+ goto out_mds;
+ }
pgm = &pgio->pg_mirrors[i];
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
@@ -883,6 +956,12 @@ out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
+ return;
+
+out_pnfs:
+ pnfs_set_lo_fail(pgio->pg_lseg);
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
}
static unsigned int
@@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
- if (ff_layout_no_fallback_to_mds(lseg) ||
- ff_layout_has_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
goto out_eagain;
- set_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
}
static bool
-ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
{
/* No mirroring for now */
struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
rpc_exit(task, -EIO);
return -EIO;
}
- if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
- dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
- if (ff_layout_has_available_ds(hdr->lseg))
- pnfs_read_resend_pnfs(hdr);
- else
- ff_layout_reset_read(hdr);
- rpc_exit(task, 0);
+ if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+ rpc_exit(task, -EHOSTDOWN);
return -EAGAIN;
}
- hdr->pgio_done_cb = ff_layout_read_done_cb;
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
@@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
- if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
- bool retry_pnfs;
-
- retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
- dprintk("%s task %u reset io to %s\n", __func__,
- task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
- ff_layout_reset_write(hdr, retry_pnfs);
- rpc_exit(task, 0);
+ if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+ rpc_exit(task, -EHOSTDOWN);
return -EAGAIN;
}
@@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
goto out_failed;
ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
- if (IS_ERR(ds_cred))
+ if (!ds_cred)
goto out_failed;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
+ hdr->pgio_done_cb = ff_layout_read_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
@@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
vers == 3 ? &ff_layout_read_call_ops_v3 :
&ff_layout_read_call_ops_v4,
0, RPC_TASK_SOFTCONN);
-
+ put_rpccred(ds_cred);
return PNFS_ATTEMPTED;
out_failed:
- if (ff_layout_has_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg))
return PNFS_TRY_AGAIN;
return PNFS_NOT_ATTEMPTED;
}
@@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
return PNFS_NOT_ATTEMPTED;
ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
- if (IS_ERR(ds_cred))
+ if (!ds_cred)
return PNFS_NOT_ATTEMPTED;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
vers == 3 ? &ff_layout_write_call_ops_v3 :
&ff_layout_write_call_ops_v4,
sync, RPC_TASK_SOFTCONN);
+ put_rpccred(ds_cred);
return PNFS_ATTEMPTED;
}
@@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct rpc_clnt *ds_clnt;
struct rpc_cred *ds_cred;
u32 idx;
- int vers;
+ int vers, ret;
struct nfs_fh *fh;
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
@@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
goto out_err;
ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
- if (IS_ERR(ds_cred))
+ if (!ds_cred)
goto out_err;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
if (fh)
data->args.fh = fh;
- return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+ ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_commit_call_ops_v3 :
&ff_layout_commit_call_ops_v4,
how, RPC_TASK_SOFTCONN);
+ put_rpccred(ds_cred);
+ return ret;
out_err:
pnfs_generic_prepare_to_resend_writes(data);
pnfs_generic_commit_release(data);
@@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void)
{
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
__func__);
+ if (!ff_zero_group) {
+ ff_zero_group = groups_alloc(0);
+ if (!ff_zero_group)
+ return -ENOMEM;
+ }
return pnfs_register_layoutdriver(&flexfilelayout_type);
}
@@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void)
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
__func__);
pnfs_unregister_layoutdriver(&flexfilelayout_type);
+ if (ff_zero_group) {
+ put_group_info(ff_zero_group);
+ ff_zero_group = NULL;
+ }
}
MODULE_ALIAS("nfs-layouttype4-4");
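The flexfilelayout.c hunks above replace the mirror's single uid/gid/cred triple with RCU-managed read-only and read-write creds that are resolved while the layout is decoded, using an empty supplementary group list (ff_zero_group). A minimal sketch of that lookup pattern, reusing the helper names seen in the diff (the wrapper function itself is hypothetical):

/* Sketch only: resolve the uid/gid a mirror advertises into a generic
 * RPC cred with no supplementary groups, and publish it under RCU in
 * the slot matching the layout's iomode. Error handling abbreviated.
 */
static int sketch_set_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
				  u32 uid, u32 gid, u32 iomode,
				  gfp_t gfp_flags)
{
	struct auth_cred acred = { .group_info = ff_zero_group };
	struct rpc_cred *cred;

	acred.uid = make_kuid(&init_user_ns, uid);
	acred.gid = make_kgid(&init_user_ns, gid);

	cred = rpc_lookup_generic_cred(&acred, 0, gfp_flags);
	if (IS_ERR(cred))
		return PTR_ERR(cred);

	if (iomode == IOMODE_READ)
		rcu_assign_pointer(mirror->ro_cred, cred);
	else
		rcu_assign_pointer(mirror->rw_cred, cred);
	return 0;
}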
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index dd353bb7dc0a..1bcdb15d0c41 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,7 +10,8 @@
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
-#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
#include "../pnfs.h"
@@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror {
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
nfs4_stateid stateid;
- u32 uid;
- u32 gid;
- struct rpc_cred *cred;
+ struct rpc_cred __rcu *ro_cred;
+ struct rpc_cred __rcu *rw_cred;
atomic_t ref;
spinlock_t lock;
struct nfs4_ff_layoutstat read_stat;
@@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
}
static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
+static inline bool
ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
{
return nfs4_test_deviceid_unavailable(node);
@@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
u32 ds_idx, struct rpc_cred *mdscred);
bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index add0e5a70bd6..0aa36be71fce 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
return e1->opnum < e2->opnum ? -1 : 1;
if (e1->status != e2->status)
return e1->status < e2->status ? -1 : 1;
- ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+ ret = memcmp(e1->stateid.data, e2->stateid.data,
+ sizeof(e1->stateid.data));
if (ret != 0)
return ret;
ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
@@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
return 0;
}
-/* currently we only support AUTH_NONE and AUTH_SYS */
-static rpc_authflavor_t
-nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+static struct rpc_cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
{
- if (mirror->uid == (u32)-1)
- return RPC_AUTH_NULL;
- return RPC_AUTH_UNIX;
-}
+ struct rpc_cred *cred, __rcu **pcred;
-/* fetch cred for NFSv3 DS */
-static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
- struct nfs4_pnfs_ds *ds)
-{
- if (ds->ds_clp && !mirror->cred &&
- mirror->mirror_ds->ds_versions[0].version == 3) {
- struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
- struct rpc_cred *cred;
- struct auth_cred acred = {
- .uid = make_kuid(&init_user_ns, mirror->uid),
- .gid = make_kgid(&init_user_ns, mirror->gid),
- };
-
- /* AUTH_NULL ignores acred */
- cred = auth->au_ops->lookup_cred(auth, &acred, 0);
- if (IS_ERR(cred)) {
- dprintk("%s: lookup_cred failed with %ld\n",
- __func__, PTR_ERR(cred));
- return PTR_ERR(cred);
- } else {
- if (cmpxchg(&mirror->cred, NULL, cred))
- put_rpccred(cred);
- }
- }
- return 0;
+ if (iomode == IOMODE_READ)
+ pcred = &mirror->ro_cred;
+ else
+ pcred = &mirror->rw_cred;
+
+ rcu_read_lock();
+ do {
+ cred = rcu_dereference(*pcred);
+ if (!cred)
+ break;
+
+ cred = get_rpccred_rcu(cred);
+ } while (!cred);
+ rcu_read_unlock();
+ return cred;
}
struct nfs_fh *
@@ -356,7 +343,23 @@ out:
return fh;
}
-/* Upon return, either ds is connected, or ds is NULL */
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @ds_idx: index of the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since a single functioning mirror is enough to satisfy a read, we don't
+ * want to return the layout while one is still usable. For writes, any down
+ * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
+ * between the two cases.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
bool fail_return)
@@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
- rpc_authflavor_t flavor;
if (!ff_layout_mirror_valid(lseg, mirror)) {
pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
@@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
if (ds->ds_clp)
- goto out_update_creds;
-
- flavor = nfs4_ff_layout_choose_authflavor(mirror);
+ goto out;
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
@@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
mirror->mirror_ds->ds_versions[0].minor_version,
- flavor);
+ RPC_AUTH_UNIX);
/* connect success, check rsize/wsize limit */
if (ds->ds_clp) {
@@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (!fail_return) {
- if (ff_layout_has_available_ds(lseg))
- set_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &lseg->pls_layout->plh_flags);
- else
- pnfs_error_mark_layout_for_return(ino, lseg);
- } else
+ if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
ds = NULL;
- goto out;
}
-out_update_creds:
- if (ff_layout_update_mirror_cred(mirror, ds))
- ds = NULL;
out:
return ds;
}
@@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
struct rpc_cred *mdscred)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- struct rpc_cred *cred = ERR_PTR(-EINVAL);
-
- if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
- goto out;
+ struct rpc_cred *cred;
- if (mirror && mirror->cred)
- cred = mirror->cred;
- else
- cred = mdscred;
-out:
+ if (mirror) {
+ cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+ if (!cred)
+ cred = get_rpccred(mdscred);
+ } else {
+ cred = get_rpccred(mdscred);
+ }
return cred;
}
@@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return ff_rw_layout_has_available_ds(lseg);
}
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+ return ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return lseg->pls_range.iomode == IOMODE_RW &&
+ ff_layout_no_read_on_rw(lseg);
+}
+
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
"retries a request before it attempts further "
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f1d1d2c472e9..5154fa65a2f2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b587ccd31083..b6cd15314bab 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -13,6 +13,7 @@
/* nfs4.2proc.c */
int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index dff83460e5a6..aa03ed09ba06 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return err;
}
+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct nfs_lock_context *src_lock,
+ struct file *dst, loff_t pos_dst,
+ struct nfs_lock_context *dst_lock,
+ size_t count)
+{
+ struct nfs42_copy_args args = {
+ .src_fh = NFS_FH(file_inode(src)),
+ .src_pos = pos_src,
+ .dst_fh = NFS_FH(file_inode(dst)),
+ .dst_pos = pos_dst,
+ .count = count,
+ };
+ struct nfs42_copy_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct inode *dst_inode = file_inode(dst);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ int status;
+
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
+ if (status)
+ return status;
+
+ status = nfs4_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == -ENOTSUPP)
+ server->caps &= ~NFS_CAP_COPY;
+ if (status)
+ return status;
+
+ if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+ status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+ if (status)
+ return status;
+ }
+
+ truncate_pagecache_range(dst_inode, pos_dst,
+ pos_dst + res.write_res.count);
+
+ return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct file *dst, loff_t pos_dst,
+ size_t count)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(dst));
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ ssize_t err, err2;
+
+ if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+ return -EOPNOTSUPP;
+
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst);
+ dst_exception.state = dst_lock->open_context->state;
+
+ do {
+ inode_lock(file_inode(dst));
+ err = _nfs42_proc_copy(src, pos_src, src_lock,
+ dst, pos_dst, dst_lock, count);
+ inode_unlock(file_inode(dst));
+
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
+}
+
static loff_t _nfs42_proc_llseek(struct file *filep,
struct nfs_lock_context *lock, loff_t offset, int whence)
{
@@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
* with the current stateid.
*/
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
} else
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0ca482a51e53..6dc6f2aea0d6 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -9,9 +9,22 @@
#define encode_fallocate_maxsz (encode_stateid_maxsz + \
2 /* offset */ + \
2 /* length */)
+#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 /* wr_count */ + \
+ 1 /* wr_committed */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
#define encode_allocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_allocate_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz (op_decode_hdr_maxsz + \
+ NFS42_WRITE_RES_SIZE + \
+ 1 /* cr_consecutive */ + \
+ 1 /* cr_synchronous */)
#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
@@ -49,6 +62,16 @@
decode_putfh_maxsz + \
decode_allocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_maxsz)
+#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
encode_fallocate(xdr, args);
}
+static void encode_copy(struct xdr_stream *xdr,
+ struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+ encode_uint64(xdr, args->src_pos);
+ encode_uint64(xdr, args->dst_pos);
+ encode_uint64(xdr, args->count);
+
+ encode_uint32(xdr, 1); /* consecutive = true */
+ encode_uint32(xdr, 1); /* synchronous = true */
+ encode_uint32(xdr, 0); /* src server list */
+}
+
static void encode_deallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args,
struct compound_hdr *hdr)
@@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
}
/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_copy_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_copy(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode DEALLOCATE request
*/
static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
@@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
return decode_op_hdr(xdr, OP_ALLOCATE);
}
+static int decode_write_response(struct xdr_stream *xdr,
+ struct nfs42_write_res *res)
+{
+ __be32 *p;
+ int stateids;
+
+ p = xdr_inline_decode(xdr, 4 + 8 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ stateids = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &res->count);
+ res->verifier.committed = be32_to_cpup(p);
+ return decode_verifier(xdr, &res->verifier.verifier);
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+ struct nfs42_copy_res *res)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ res->consecutive = be32_to_cpup(p++);
+ res->synchronous = be32_to_cpup(p++);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_COPY);
+ if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+ status = decode_copy_requirements(xdr, res);
+ if (status)
+ return status;
+ return NFS4ERR_OFFLOAD_NO_REQS;
+ } else if (status)
+ return status;
+
+ status = decode_write_response(xdr, &res->write_res);
+ if (status)
+ return status;
+
+ return decode_copy_requirements(xdr, res);
+}
+
static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -331,6 +447,36 @@ out:
}
/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_copy_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_copy(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode DEALLOCATE request
*/
static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4afdee420d25..768456fa1b17 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
struct nfs41_server_scope **);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
- fmode_t, const struct nfs_lockowner *);
+extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
+ const struct nfs_lockowner *, nfs4_stateid *,
+ struct rpc_cred **);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4;
static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
{
- memcpy(dst, src, sizeof(*dst));
+ memcpy(dst->data, src->data, sizeof(dst->data));
+ dst->type = src->type;
}
static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
{
- return memcmp(dst, src, sizeof(*dst)) == 0;
+ if (dst->type != src->type)
+ return false;
+ return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
}
static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
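Because nfs4_stateid_match() now checks the in-memory type tag as well as the wire data, two stateids that are byte-for-byte identical on the wire stop comparing equal once they were decoded as different kinds. A small illustrative check (the .data/.type layout and the type constants are taken from the hunks in this series; the demo function itself is hypothetical):

/* Sketch only: same wire data, different type tags, no match. */
static bool sketch_stateid_type_demo(void)
{
	nfs4_stateid open_sid = { .type = NFS4_OPEN_STATEID_TYPE };
	nfs4_stateid lock_sid = { .type = NFS4_LOCK_STATEID_TYPE };

	memcpy(lock_sid.data, open_sid.data, sizeof(lock_sid.data));

	/* false: the data matches, the types do not */
	return nfs4_stateid_match(&open_sid, &lock_sid);
}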
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index d0390516467c..014b0e41ace5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
}
#ifdef CONFIG_NFS_V4_2
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count, unsigned int flags)
+{
+ struct inode *in_inode = file_inode(file_in);
+ struct inode *out_inode = file_inode(file_out);
+ int ret;
+
+ if (in_inode == out_inode)
+ return -EINVAL;
+
+ /* flush any pending writes */
+ ret = nfs_sync_inode(in_inode);
+ if (ret)
+ return ret;
+ ret = nfs_sync_inode(out_inode);
+ if (ret)
+ return ret;
+
+ return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
loff_t ret;
@@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = {
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
#ifdef CONFIG_NFS_V4_2
+ .copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek,
.fallocate = nfs42_fallocate,
.clone_file_range = nfs42_clone_file_range,
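With .copy_file_range wired into nfs4_file_operations, a v4.2 client can ask the server to perform the copy itself instead of pulling the data through the client. An illustrative userspace sketch (not part of the patch; the paths are made up, and syscall(2) is used because glibc had no wrapper at the time; the VFS also required both files to be on the same filesystem):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int in = open("/mnt/nfs/src.img", O_RDONLY);
	int out = open("/mnt/nfs/dst.img", O_WRONLY | O_CREAT, 0644);
	ssize_t copied;

	if (in < 0 || out < 0)
		return 1;

	/* Offload a 1 MiB copy to the NFS server via the COPY operation */
	copied = syscall(__NR_copy_file_range, in, NULL, out, NULL,
			 1024 * 1024, 0);
	if (copied < 0)
		perror("copy_file_range");
	else
		printf("copied %zd bytes\n", copied);

	close(in);
	close(out);
	return copied < 0;
}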
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 5ba22c6b0ffa..c444285bb1b1 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -201,7 +201,7 @@ int nfs_idmap_init(void)
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 084e8570da18..de97567795a5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -74,6 +74,17 @@
#define NFS4_POLL_RETRY_MIN (HZ/10)
#define NFS4_POLL_RETRY_MAX (15*HZ)
+/* file attributes which can be mapped to nfs attributes */
+#define NFS4_VALID_ATTRS (ATTR_MODE \
+ | ATTR_UID \
+ | ATTR_GID \
+ | ATTR_SIZE \
+ | ATTR_ATIME \
+ | ATTR_MTIME \
+ | ATTR_CTIME \
+ | ATTR_ATIME_SET \
+ | ATTR_MTIME_SET)
+
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
+ case -NFS4ERR_RECALLCONFLICT:
exception->delay = 1;
return 0;
@@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir,
if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
nfs4_exclusive_attrset(opendata, sattr, &label);
-
- nfs_fattr_init(opendata->o_res.f_attr);
- status = nfs4_do_setattr(state->inode, cred,
- opendata->o_res.f_attr, sattr,
- state, label, olabel);
- if (status == 0) {
- nfs_setattr_update_inode(state->inode, sattr,
- opendata->o_res.f_attr);
- nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ /*
+ * Send an extra SETATTR for any create attributes that the
+ * open call itself did not set.
+ */
+ if (sattr->ia_valid & NFS4_VALID_ATTRS) {
+ nfs_fattr_init(opendata->o_res.f_attr);
+ status = nfs4_do_setattr(state->inode, cred,
+ opendata->o_res.f_attr, sattr,
+ state, label, olabel);
+ if (status == 0) {
+ nfs_setattr_update_inode(state->inode, sattr,
+ opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ }
}
}
if (opened && opendata->file_created)
@@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
.rpc_resp = &res,
.rpc_cred = cred,
};
+ struct rpc_cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
fmode_t fmode;
bool truncate;
@@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
fmode = truncate ? FMODE_WRITE : FMODE_READ;
- if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+ if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
/* Use that stateid */
} else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
@@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
};
if (!nfs4_valid_open_stateid(state))
return -EBADF;
- if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
- &lockowner) == -EIO)
+ if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
+ &arg.stateid, &delegation_cred) == -EIO)
return -EBADF;
} else
nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+ if (delegation_cred)
+ msg.rpc_cred = delegation_cred;
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+
+ put_rpccred(delegation_cred);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
trace_nfs4_setattr(inode, &arg.stateid, status);
@@ -4285,7 +4307,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
if (l_ctx != NULL)
lockowner = &l_ctx->lockowner;
- return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+ return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
}
EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
@@ -4993,12 +5015,11 @@ static int nfs4_do_set_security_label(struct inode *inode,
}
static int
-nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
{
struct nfs4_label ilabel, *olabel = NULL;
struct nfs_fattr fattr;
struct rpc_cred *cred;
- struct inode *inode = d_inode(dentry);
int status;
if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -6054,6 +6075,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs4_state_owner *sp = state->owner;
unsigned char fl_flags = request->fl_flags;
int status = -ENOLCK;
@@ -6068,6 +6090,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
status = do_vfs_lock(state->inode, request);
if (status < 0)
goto out;
+ mutex_lock(&sp->so_delegreturn_mutex);
down_read(&nfsi->rwsem);
if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
/* Yes: cache locks! */
@@ -6075,9 +6098,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
request->fl_flags = fl_flags & ~FL_SLEEP;
status = do_vfs_lock(state->inode, request);
up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
}
up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
out:
request->fl_flags = fl_flags;
@@ -6255,11 +6280,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
- struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
{
- return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
+ return nfs4_proc_set_acl(inode, buf, buflen);
}
static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
@@ -6277,12 +6302,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
- struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
{
if (security_ismaclabel(key))
- return nfs4_set_security_label(dentry, buf, buflen);
+ return nfs4_set_security_label(inode, buf, buflen);
return -EOPNOTSUPP;
}
@@ -7351,9 +7376,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
* always set csa_cachethis to FALSE because the current implementation
* of the back channel DRC only supports caching the CB_SEQUENCE operation.
*/
-static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+ struct rpc_clnt *clnt)
{
unsigned int max_rqst_sz, max_resp_sz;
+ unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7371,8 +7398,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
/* Back channel attributes */
- args->bc_attrs.max_rqst_sz = PAGE_SIZE;
- args->bc_attrs.max_resp_sz = PAGE_SIZE;
+ args->bc_attrs.max_rqst_sz = max_bc_payload;
+ args->bc_attrs.max_resp_sz = max_bc_payload;
args->bc_attrs.max_resp_sz_cached = 0;
args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
@@ -7476,7 +7503,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
};
int status;
- nfs4_init_channel_attrs(&args);
+ nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -7820,40 +7847,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
struct nfs4_session *session = nfs4_get_session(server);
- int ret;
dprintk("--> %s\n", __func__);
- /* Note the is a race here, where a CB_LAYOUTRECALL can come in
- * right now covering the LAYOUTGET we are about to send.
- * However, that is not so catastrophic, and there seems
- * to be no way to prevent it completely.
- */
- if (nfs41_setup_sequence(session, &lgp->args.seq_args,
- &lgp->res.seq_res, task))
- return;
- ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
- NFS_I(lgp->args.inode)->layout,
- &lgp->args.range,
- lgp->args.ctx->state);
- if (ret < 0)
- rpc_exit(task, ret);
+ nfs41_setup_sequence(session, &lgp->args.seq_args,
+ &lgp->res.seq_res, task);
+ dprintk("<-- %s\n", __func__);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ nfs41_sequence_done(task, &lgp->res.seq_res);
+ dprintk("<-- %s\n", __func__);
+}
+
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+ struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
- struct nfs4_state *state = NULL;
- unsigned long timeo, now, giveup;
+ int status = task->tk_status;
dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
- if (!nfs41_sequence_done(task, &lgp->res.seq_res))
- goto out;
-
- switch (task->tk_status) {
+ switch (status) {
case 0:
goto out;
@@ -7863,57 +7884,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
* retry go inband.
*/
case -NFS4ERR_LAYOUTUNAVAILABLE:
- task->tk_status = -ENODATA;
+ status = -ENODATA;
goto out;
/*
* NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
* length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
*/
case -NFS4ERR_BADLAYOUT:
- goto out_overflow;
+ status = -EOVERFLOW;
+ goto out;
/*
* NFS4ERR_LAYOUTTRYLATER is a conflict with another client
* (or clients) writing to the same RAID stripe except when
* the minlength argument is 0 (see RFC5661 section 18.43.3).
+ *
+ * Treat it like we would RECALLCONFLICT -- we retry for a little
+ * while, and then eventually give up.
*/
case -NFS4ERR_LAYOUTTRYLATER:
- if (lgp->args.minlength == 0)
- goto out_overflow;
- /*
- * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
- * existing layout before getting a new one).
- */
- case -NFS4ERR_RECALLCONFLICT:
- timeo = rpc_get_timeout(task->tk_client);
- giveup = lgp->args.timestamp + timeo;
- now = jiffies;
- if (time_after(giveup, now)) {
- unsigned long delay;
-
- /* Delay for:
- * - Not less then NFS4_POLL_RETRY_MIN.
- * - One last time a jiffie before we give up
- * - exponential backoff (time_now minus start_attempt)
- */
- delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
- min((giveup - now - 1),
- now - lgp->args.timestamp));
-
- dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
- __func__, delay);
- rpc_delay(task, delay);
- /* Do not call nfs4_async_handle_error() */
- goto out_restart;
+ if (lgp->args.minlength == 0) {
+ status = -EOVERFLOW;
+ goto out;
}
- break;
+ /* Fallthrough */
+ case -NFS4ERR_RECALLCONFLICT:
+ nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
+ exception);
+ status = -ERECALLCONFLICT;
+ goto out;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
+ exception->timeout = 0;
spin_lock(&inode->i_lock);
if (nfs4_stateid_match(&lgp->args.stateid,
&lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
/* If the open stateid was bad, then recover it. */
- state = lgp->args.ctx->state;
+ exception->state = lgp->args.ctx->state;
break;
}
lo = NFS_I(inode)->layout;
@@ -7926,25 +7933,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
* with the current stateid.
*/
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
} else
spin_unlock(&inode->i_lock);
- goto out_restart;
+ status = -EAGAIN;
+ goto out;
}
- if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
- goto out_restart;
+
+ status = nfs4_handle_exception(server, status, exception);
+ if (exception->retry)
+ status = -EAGAIN;
out:
dprintk("<-- %s\n", __func__);
- return;
-out_restart:
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- return;
-out_overflow:
- task->tk_status = -EOVERFLOW;
- goto out;
+ return status;
}
static size_t max_response_pages(struct nfs_server *server)
@@ -8013,7 +8016,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
};
struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
@@ -8033,6 +8036,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
.flags = RPC_TASK_ASYNC,
};
struct pnfs_layout_segment *lseg = NULL;
+ struct nfs4_exception exception = { .timeout = *timeout };
int status = 0;
dprintk("--> %s\n", __func__);
@@ -8046,7 +8050,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
return ERR_PTR(-ENOMEM);
}
lgp->args.layout.pglen = max_pages * PAGE_SIZE;
- lgp->args.timestamp = jiffies;
lgp->res.layoutp = &lgp->args.layout;
lgp->res.seq_res.sr_slot = NULL;
@@ -8056,13 +8059,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
if (IS_ERR(task))
return ERR_CAST(task);
status = nfs4_wait_for_completion_rpc_task(task);
- if (status == 0)
- status = task->tk_status;
+ if (status == 0) {
+ status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+ *timeout = exception.timeout;
+ }
+
trace_nfs4_layoutget(lgp->args.ctx,
&lgp->args.range,
&lgp->res.range,
&lgp->res.stateid,
status);
+
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
@@ -8118,7 +8125,8 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
+ be32_to_cpu(lrp->args.stateid.seqid));
pnfs_mark_layout_returned_if_empty(lo);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
@@ -8653,6 +8661,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
static bool nfs41_match_stateid(const nfs4_stateid *s1,
const nfs4_stateid *s2)
{
+ if (s1->type != s2->type)
+ return false;
+
if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
return false;
@@ -8793,6 +8804,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1
| NFS_CAP_ALLOCATE
+ | NFS_CAP_COPY
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
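nfs4_proc_layoutget() no longer restarts the RPC internally; retryable outcomes (-EAGAIN, -ERECALLCONFLICT) are reported to the caller together with a timeout that carries the backoff state between attempts. Roughly, a caller is expected to loop along these lines (the loop shape and helper below are assumptions for illustration; the real retry logic lives in pnfs_update_layout(), outside this hunk, and also bounds the number of attempts):

/* Sketch only: drive layoutget with a caller-held timeout. */
long timeout = 0;
struct pnfs_layout_segment *lseg;

do {
	struct nfs4_layoutget *lgp;

	lgp = build_layoutget_args();	/* hypothetical helper; a fresh
					 * request is needed per attempt */
	lseg = nfs4_proc_layoutget(lgp, &timeout, GFP_NOFS);
} while (IS_ERR(lseg) &&
	 (PTR_ERR(lseg) == -EAGAIN || PTR_ERR(lseg) == -ERECALLCONFLICT));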
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d854693a15b0..9679f4749364 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -65,7 +65,10 @@
#define OPENOWNER_POOL_SIZE 8
-const nfs4_stateid zero_stateid;
+const nfs4_stateid zero_stateid = {
+ { .data = { 0 } },
+ .type = NFS4_SPECIAL_STATEID_TYPE,
+};
static DEFINE_MUTEX(nfs_clid_init_mutex);
int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
* Byte-range lock aware utility to initialize the stateid of read/write
* requests.
*/
-int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
- fmode_t fmode, const struct nfs_lockowner *lockowner)
+int nfs4_select_rw_stateid(struct nfs4_state *state,
+ fmode_t fmode, const struct nfs_lockowner *lockowner,
+ nfs4_stateid *dst, struct rpc_cred **cred)
{
- int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ int ret;
+
+ if (cred != NULL)
+ *cred = NULL;
+ ret = nfs4_copy_lock_stateid(dst, state, lockowner);
if (ret == -EIO)
/* A lost lock - don't even consider delegations */
goto out;
/* returns true if delegation stateid found and copied */
- if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+ if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
ret = 0;
goto out;
}
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2c8d05dae5b1..9c150b153782 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
{ PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
{ PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \
+ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \
{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
u64 count,
enum pnfs_iomode iomode,
struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
enum pnfs_update_layout_reason reason
),
- TP_ARGS(inode, pos, count, iomode, lo, reason),
+ TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
__field(enum pnfs_iomode, iomode)
__field(int, layoutstateid_seq)
__field(u32, layoutstateid_hash)
+ __field(long, lseg)
__field(enum pnfs_update_layout_reason, reason)
),
TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
__entry->layoutstateid_seq = 0;
__entry->layoutstateid_hash = 0;
}
+ __entry->lseg = (long)lseg;
),
TP_printk(
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"iomode=%s pos=%llu count=%llu "
- "layoutstateid=%d:0x%08x (%s)",
+ "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
(unsigned long long)__entry->pos,
(unsigned long long)__entry->count,
__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ __entry->lseg,
show_pnfs_update_layout_reason(__entry->reason)
)
);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 88474a4fc669..661e753fe1c9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
}
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_OPEN_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LOCK_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
{
int status;
@@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
return status;
}
@@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
if (status == -EIO)
goto out;
if (status == 0) {
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_lock_stateid(xdr, &res->stateid);
if (unlikely(status))
goto out;
} else if (status == -NFS4ERR_DENIED)
@@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
if (status != -EIO)
nfs_increment_lock_seqid(status, res->seqid);
if (status == 0)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_lock_stateid(xdr, &res->stateid);
return status;
}
@@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
__be32 *p;
int status;
- status = decode_stateid(xdr, &res->delegation);
+ status = decode_delegation_stateid(xdr, &res->delegation);
if (unlikely(status))
return status;
p = xdr_inline_decode(xdr, 4);
@@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
nfs_increment_open_seqid(status, res->seqid);
if (status)
return status;
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
if (unlikely(status))
return status;
@@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
return status;
}
@@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
return status;
}
@@ -5838,6 +5856,12 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct nfs4_getdeviceinfo_res *res)
{
@@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
if (unlikely(!p))
goto out_overflow;
res->return_on_close = be32_to_cpup(p);
- decode_stateid(xdr, &res->stateid);
+ decode_layout_stateid(xdr, &res->stateid);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
@@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
goto out_overflow;
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_layout_stateid(xdr, &res->stateid);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -7515,6 +7539,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
PROC(CLONE, enc_clone, dec_clone),
+ PROC(COPY, enc_copy, dec_copy),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 1f6db4231057..174dd4cf5747 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
* long write-back delay. This will be adjusted in
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
- req->wb_index = page_file_index(page);
- get_page(page);
+ if (page) {
+ req->wb_index = page_file_index(page);
+ get_page(page);
+ }
req->wb_offset = offset;
req->wb_pgbase = offset;
req->wb_bytes = count;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 89a5ef4df08a..0c7e0d45a4de 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
};
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+ return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
}
static int
@@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
spin_lock(&inode->i_lock);
pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
- pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
@@ -522,13 +522,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
return rv;
}
-/* Returns count of number of matching invalid lsegs remaining in list
- * after call.
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+ return (s32)(s1 - s2) > 0;
+}
+
+/**
+ * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
+ * @lo: layout header containing the lsegs
+ * @tmp_list: list head where doomed lsegs should go
+ * @recall_range: optional recall range argument to match (may be NULL)
+ * @seq: only invalidate lsegs obtained at or before this sequence (may be 0)
+ *
+ * Walk the list of lsegs in the layout header, and tear down any that should
+ * be destroyed. If "recall_range" is specified then the segment must match
+ * that range. If "seq" is non-zero, then only match segments that were handed
+ * out at or before that sequence.
+ *
+ * Returns number of matching invalid lsegs remaining in list after scanning
+ * it and purging them.
*/
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *recall_range)
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
int remaining = 0;
@@ -540,10 +562,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (!recall_range ||
should_free_lseg(&lseg->pls_range, recall_range)) {
- dprintk("%s: freeing lseg %p iomode %d "
+ if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+ continue;
+ dprintk("%s: freeing lseg %p iomode %d seq %u"
"offset %llu length %llu\n", __func__,
- lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
- lseg->pls_range.length);
+ lseg, lseg->pls_range.iomode, lseg->pls_seq,
+ lseg->pls_range.offset, lseg->pls_range.length);
if (!mark_lseg_invalid(lseg, tmp_list))
remaining++;
}
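
Aside (not part of the patch): pnfs_seqid_is_newer() above uses the usual signed-difference idiom for comparing 32-bit sequence numbers that may wrap past zero; the same idiom reappears later in this series as nfsd4_stateid_generation_after(). A minimal user-space sketch of that comparison, with illustrative names only:

	#include <assert.h>
	#include <stdint.h>

	/* Same test as the kernel helper: s1 is newer than s2 iff the signed
	 * difference is positive, which stays correct across wraparound. */
	static int seqid_is_newer(uint32_t s1, uint32_t s2)
	{
		return (int32_t)(s1 - s2) > 0;
	}

	int main(void)
	{
		assert(seqid_is_newer(2, 1));			/* ordinary case */
		assert(!seqid_is_newer(1, 2));
		assert(seqid_is_newer(0, UINT32_MAX));		/* just wrapped: 0 is newer */
		assert(!seqid_is_newer(UINT32_MAX, 0));
		return 0;
	}
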
@@ -730,15 +754,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
-/*
- * Compare 2 layout stateid sequence ids, to see which is newer,
- * taking into account wraparound issues.
- */
-static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
-{
- return (s32)(s1 - s2) > 0;
-}
-
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -781,50 +796,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
-int
-pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- const struct pnfs_layout_range *range,
- struct nfs4_state *open_state)
-{
- int status = 0;
-
- dprintk("--> %s\n", __func__);
- spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo)) {
- status = -EAGAIN;
- } else if (!nfs4_valid_open_stateid(open_state)) {
- status = -EBADF;
- } else if (list_empty(&lo->plh_segs) ||
- test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
- int seq;
-
- do {
- seq = read_seqbegin(&open_state->seqlock);
- nfs4_stateid_copy(dst, &open_state->stateid);
- } while (read_seqretry(&open_state->seqlock, seq));
- } else
- nfs4_stateid_copy(dst, &lo->plh_stateid);
- spin_unlock(&lo->plh_inode->i_lock);
- dprintk("<-- %s\n", __func__);
- return status;
-}
-
/*
-* Get layout from server.
-* for now, assume that whole file layouts are requested.
-* arg->offset: 0
-* arg->length: all ones
-*/
+ * Get layout from server.
+ * for now, assume that whole file layouts are requested.
+ * arg->offset: 0
+ * arg->length: all ones
+ */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
+ nfs4_stateid *stateid,
const struct pnfs_layout_range *range,
- gfp_t gfp_flags)
+ long *timeout, gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
- struct pnfs_layout_segment *lseg;
loff_t i_size;
dprintk("--> %s\n", __func__);
@@ -834,40 +821,31 @@ send_layoutget(struct pnfs_layout_hdr *lo,
* store in lseg. If we race with a concurrent seqid morphing
* op, then re-send the LAYOUTGET.
*/
- do {
- lgp = kzalloc(sizeof(*lgp), gfp_flags);
- if (lgp == NULL)
- return NULL;
-
- i_size = i_size_read(ino);
-
- lgp->args.minlength = PAGE_SIZE;
- if (lgp->args.minlength > range->length)
- lgp->args.minlength = range->length;
- if (range->iomode == IOMODE_READ) {
- if (range->offset >= i_size)
- lgp->args.minlength = 0;
- else if (i_size - range->offset < lgp->args.minlength)
- lgp->args.minlength = i_size - range->offset;
- }
- lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- pnfs_copy_range(&lgp->args.range, range);
- lgp->args.type = server->pnfs_curr_ld->id;
- lgp->args.inode = ino;
- lgp->args.ctx = get_nfs_open_context(ctx);
- lgp->gfp_flags = gfp_flags;
- lgp->cred = lo->plh_lc_cred;
-
- lseg = nfs4_proc_layoutget(lgp, gfp_flags);
- } while (lseg == ERR_PTR(-EAGAIN));
-
- if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
- lseg = NULL;
- else
- pnfs_layout_clear_fail_bit(lo,
- pnfs_iomode_to_fail_bit(range->iomode));
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return ERR_PTR(-ENOMEM);
- return lseg;
+ i_size = i_size_read(ino);
+
+ lgp->args.minlength = PAGE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
+ }
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ nfs4_stateid_copy(&lgp->args.stateid, stateid);
+ lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
+
+ return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
}
static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -899,6 +877,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
pnfs_get_layout_hdr(lo);
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
return true;
@@ -969,6 +948,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
bool send;
nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ stateid.seqid = cpu_to_be32(lo->plh_return_seq);
iomode = lo->plh_return_iomode;
send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock);
@@ -1012,7 +992,7 @@ _pnfs_return_layout(struct inode *ino)
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
pnfs_clear_layoutcommit(ino, &tmp_list);
- pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+ pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
struct pnfs_layout_range range = {
@@ -1341,23 +1321,28 @@ out_existing:
/*
* iomode matching rules:
- * iomode lseg match
- * ----- ----- -----
- * ANY READ true
- * ANY RW true
- * RW READ false
- * RW RW true
- * READ READ true
- * READ RW true
+ * iomode   lseg   strict_iomode   match
+ * ------   ----   -------------   -----
+ * ANY      READ   N/A             true
+ * ANY      RW     N/A             true
+ * RW       READ   N/A             false
+ * RW       RW     N/A             true
+ * READ     READ   N/A             true
+ * READ     RW     true            false
+ * READ     RW     false           true
*/
static bool
pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
- const struct pnfs_layout_range *range)
+ const struct pnfs_layout_range *range,
+ bool strict_iomode)
{
struct pnfs_layout_range range1;
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
+ (range->iomode != ls_range->iomode &&
+ strict_iomode) ||
!pnfs_lseg_range_intersecting(ls_range, range))
return 0;
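
For clarity, the matching table above can be exercised with a small user-space model of the iomode half of pnfs_lseg_range_match(); the offset/length intersection test is omitted, and the enum values and names below are illustrative, not the kernel's:

	#include <assert.h>
	#include <stdbool.h>

	enum iomode { IOMODE_ANY, IOMODE_READ, IOMODE_RW };

	/* Mirrors the iomode checks: a READ lseg never satisfies an RW request,
	 * and with strict_iomode the two iomodes must be equal. */
	static bool iomode_match(enum iomode lseg, enum iomode req, bool strict)
	{
		if (req == IOMODE_RW && lseg != IOMODE_RW)
			return false;
		if (strict && req != lseg)
			return false;
		return true;
	}

	int main(void)
	{
		assert(iomode_match(IOMODE_RW, IOMODE_READ, true) == false);	/* READ req, RW lseg, strict */
		assert(iomode_match(IOMODE_RW, IOMODE_READ, false) == true);	/* READ req, RW lseg, lax */
		assert(iomode_match(IOMODE_READ, IOMODE_RW, false) == false);	/* RW req, READ lseg */
		return 0;
	}
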
@@ -1372,7 +1357,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
*/
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range)
+ struct pnfs_layout_range *range,
+ bool strict_iomode)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -1381,7 +1367,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
- pnfs_lseg_range_match(&lseg->pls_range, range)) {
+ pnfs_lseg_range_match(&lseg->pls_range, range,
+ strict_iomode)) {
ret = pnfs_get_lseg(lseg);
break;
}
@@ -1498,6 +1485,7 @@ pnfs_update_layout(struct inode *ino,
loff_t pos,
u64 count,
enum pnfs_iomode iomode,
+ bool strict_iomode,
gfp_t gfp_flags)
{
struct pnfs_layout_range arg = {
@@ -1505,27 +1493,30 @@ pnfs_update_layout(struct inode *ino,
.offset = pos,
.length = count,
};
- unsigned pg_offset;
+ unsigned pg_offset, seq;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs_client *clp = server->nfs_client;
- struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_hdr *lo = NULL;
struct pnfs_layout_segment *lseg = NULL;
+ nfs4_stateid stateid;
+ long timeout = 0;
+ unsigned long giveup = jiffies + rpc_get_timeout(server->client);
bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NO_PNFS);
goto out;
}
if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
goto out;
}
if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_MDSTHRESH);
goto out;
}
@@ -1536,14 +1527,14 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
}
/* Do we even need to bother with this? */
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_BULK_RECALL);
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
@@ -1551,14 +1542,34 @@ lookup_again:
/* if LAYOUTGET already failed once we don't try again */
if (pnfs_layout_io_test_failed(lo, iomode)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
}
- first = list_empty(&lo->plh_segs);
- if (first) {
- /* The first layoutget for the file. Need to serialize per
+ lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+ goto out_unlock;
+ }
+
+ if (!nfs4_valid_open_stateid(ctx->state)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+ goto out_unlock;
+ }
+
+ /*
+ * Choose a stateid for the LAYOUTGET. If we don't have a layout
+ * stateid, or it has been invalidated, then we must use the open
+ * stateid.
+ */
+ if (lo->plh_stateid.seqid == 0 ||
+ test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+
+ /*
+ * The first layoutget for the file. Need to serialize per
* RFC 5661 Errata 3208.
*/
if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1567,18 +1578,17 @@ lookup_again:
wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
TASK_UNINTERRUPTIBLE);
pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
goto lookup_again;
}
+
+ first = true;
+ do {
+ seq = read_seqbegin(&ctx->state->seqlock);
+ nfs4_stateid_copy(&stateid, &ctx->state->stateid);
+ } while (read_seqretry(&ctx->state->seqlock, seq));
} else {
- /* Check to see if the layout for the given range
- * already exists
- */
- lseg = pnfs_find_lseg(lo, &arg);
- if (lseg) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
- PNFS_UPDATE_LAYOUT_FOUND_CACHED);
- goto out_unlock;
- }
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
}
/*
@@ -1593,15 +1603,17 @@ lookup_again:
pnfs_clear_first_layoutget(lo);
pnfs_put_layout_hdr(lo);
dprintk("%s retrying\n", __func__);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ lseg, PNFS_UPDATE_LAYOUT_RETRY);
goto lookup_again;
}
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_RETURN);
goto out_put_layout_hdr;
}
if (pnfs_layoutgets_blocked(lo)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_BLOCKED);
goto out_unlock;
}
@@ -1626,10 +1638,36 @@ lookup_again:
if (arg.length != NFS4_MAX_UINT64)
arg.length = PAGE_ALIGN(arg.length);
- lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- atomic_dec(&lo->plh_outstanding);
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+ if (IS_ERR(lseg)) {
+ switch (PTR_ERR(lseg)) {
+ case -ERECALLCONFLICT:
+ if (time_after(jiffies, giveup))
+ lseg = NULL;
+ /* Fallthrough */
+ case -EAGAIN:
+ pnfs_put_layout_hdr(lo);
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+ goto lookup_again;
+ }
+ /* Fallthrough */
+ default:
+ if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ }
+ }
+ } else {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ }
+
+ atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
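
The error handling above moves the LAYOUTGET retry loop out of send_layoutget() and into pnfs_update_layout(): -EAGAIN retries immediately, -ERECALLCONFLICT retries only until roughly one RPC timeout has elapsed, and other non-fatal errors fall back to MDS I/O. A compressed user-space sketch of that retry policy (the timer and the ERECALLCONFLICT value are stand-ins, not the kernel's):

	#include <errno.h>
	#include <stdio.h>
	#include <time.h>

	#define ERECALLCONFLICT 1064	/* stand-in value; the real code is kernel-internal */

	/* Stand-in for one LAYOUTGET attempt: fail twice, then succeed. */
	static int try_layoutget(int attempt)
	{
		return attempt < 2 ? -EAGAIN : 0;
	}

	int main(void)
	{
		time_t giveup = time(NULL) + 30;	/* roughly one RPC timeout */
		int attempt = 0, err;

	retry:
		err = try_layoutget(attempt++);
		switch (-err) {
		case ERECALLCONFLICT:
			if (time(NULL) > giveup)
				break;			/* deadline passed: stop retrying */
			/* fall through */
		case EAGAIN:
			goto retry;			/* transient error: try again */
		default:
			break;				/* success, or give up and use the MDS */
		}
		printf("done after %d attempt(s), err = %d\n", attempt, err);
		return 0;
	}
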
@@ -1678,38 +1716,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
LIST_HEAD(free_me);
- int status = -EINVAL;
if (!pnfs_sanity_check_layout_range(&res->range))
- goto out;
+ return ERR_PTR(-EINVAL);
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
- if (!lseg || IS_ERR(lseg)) {
+ if (IS_ERR_OR_NULL(lseg)) {
if (!lseg)
- status = -ENOMEM;
- else
- status = PTR_ERR(lseg);
- dprintk("%s: Could not allocate layout: error %d\n",
- __func__, status);
- goto out;
+ lseg = ERR_PTR(-ENOMEM);
+
+ dprintk("%s: Could not allocate layout: error %ld\n",
+ __func__, PTR_ERR(lseg));
+ return lseg;
}
init_lseg(lo, lseg);
lseg->pls_range = res->range;
+ lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
spin_lock(&ino->i_lock);
if (pnfs_layoutgets_blocked(lo)) {
dprintk("%s forget reply due to state\n", __func__);
- goto out_forget_reply;
+ goto out_forget;
}
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
- status = -EAGAIN;
- goto out_forget_reply;
+ goto out_forget;
}
pnfs_set_layout_stateid(lo, &res->stateid, false);
} else {
@@ -1718,7 +1754,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* inode invalid, and don't bother validating the stateid
* sequence number.
*/
- pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+ pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
@@ -1735,18 +1771,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me);
return lseg;
-out:
- return ERR_PTR(status);
-out_forget_reply:
+out_forget:
spin_unlock(&ino->i_lock);
lseg->pls_layout = lo;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- goto out;
+ return ERR_PTR(-EAGAIN);
}
static void
-pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+ u32 seq)
{
if (lo->plh_return_iomode == iomode)
return;
@@ -1754,6 +1789,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+ lo->plh_return_seq = seq;
}
/**
@@ -1769,7 +1806,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *return_range)
+ const struct pnfs_layout_range *return_range,
+ u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
int remaining = 0;
@@ -1792,8 +1830,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- pnfs_set_plh_return_iomode(lo, return_range->iomode);
}
+
+ if (remaining)
+ pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+
return remaining;
}
@@ -1810,13 +1851,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
- pnfs_set_plh_return_iomode(lo, range.iomode);
+ pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
+ &range, lseg->pls_seq)) {
nfs4_stateid stateid;
enum pnfs_iomode iomode = lo->plh_return_iomode;
@@ -1849,6 +1891,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
req_offset(req),
rd_size,
IOMODE_READ,
+ false,
GFP_KERNEL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1873,6 +1916,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
req_offset(req),
wb_size,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -2143,12 +2187,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
}
/* Resend all requests through pnfs. */
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
{
struct nfs_pageio_descriptor pgio;
- nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
- return nfs_pageio_resend(&pgio, hdr);
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ nfs_pageio_init_read(&pgio, hdr->inode, false,
+ hdr->completion_ops);
+ hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
+ }
}
EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
@@ -2158,12 +2205,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
- int err = 0;
trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
if (trypnfs == PNFS_TRY_AGAIN)
- err = pnfs_read_resend_pnfs(hdr);
- if (trypnfs == PNFS_NOT_ATTEMPTED || err)
+ pnfs_read_resend_pnfs(hdr);
+ if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
pnfs_read_through_mds(desc, hdr);
}
@@ -2405,7 +2451,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
spin_lock(&inode->i_lock);
if (!NFS_I(inode)->layout) {
spin_unlock(&inode->i_lock);
- goto out;
+ goto out_clear_layoutstats;
}
hdr = NFS_I(inode)->layout;
pnfs_get_layout_hdr(hdr);
@@ -2434,6 +2480,7 @@ out_free:
kfree(data);
out_put:
pnfs_put_layout_hdr(hdr);
+out_clear_layoutstats:
smp_mb__before_atomic();
clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
smp_mb__after_atomic();
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1ac1db5f6dad..b21bd0bee784 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -64,6 +64,7 @@ struct pnfs_layout_segment {
struct list_head pls_lc_list;
struct pnfs_layout_range pls_range;
atomic_t pls_refcount;
+ u32 pls_seq;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
struct work_struct pls_work;
@@ -194,6 +195,7 @@ struct pnfs_layout_hdr {
unsigned long plh_flags;
nfs4_stateid plh_stateid;
u32 plh_barrier; /* ignore lower seqids */
+ u32 plh_return_seq;
enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
struct rpc_cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
@@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
bool update_barrier);
-int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
- struct pnfs_layout_hdr *lo,
- const struct pnfs_layout_range *range,
- struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
void pnfs_ld_write_done(struct nfs_pgio_header *);
void pnfs_ld_read_done(struct nfs_pgio_header *);
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
u64 count,
enum pnfs_iomode iomode,
+ bool strict_iomode,
gfp_t gfp_flags);
void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 4aaed890048f..0dfc476da3e1 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
- * Note this must be called holding the inode (/cinfo) lock
+ * Note this must be called holding i_lock
*/
void
pnfs_generic_clear_request_commit(struct nfs_page *req,
@@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
if (!nfs_lock_request(req))
continue;
kref_get(&req->wb_kref);
- if (cond_resched_lock(cinfo->lock))
+ if (cond_resched_lock(&cinfo->inode->i_lock))
list_safe_reset_next(req, tmp, wb_list);
nfs_request_remove_commit_list(req, cinfo);
clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
struct list_head *dst = &bucket->committing;
int ret;
- lockdep_assert_held(cinfo->lock);
+ lockdep_assert_held(&cinfo->inode->i_lock);
ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
if (ret) {
cinfo->ds->nwritten -= ret;
@@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
{
int i, rv = 0, cnt;
- lockdep_assert_held(cinfo->lock);
+ lockdep_assert_held(&cinfo->inode->i_lock);
for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
cinfo, max);
@@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct pnfs_layout_segment *freeme;
int i;
- lockdep_assert_held(cinfo->lock);
+ lockdep_assert_held(&cinfo->inode->i_lock);
restart:
for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
if (pnfs_generic_transfer_commit_list(&b->written, dst,
cinfo, 0)) {
freeme = b->wlseg;
b->wlseg = NULL;
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
pnfs_put_lseg(freeme);
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
goto restart;
}
}
@@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
LIST_HEAD(pages);
int i;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
for (i = idx; i < fl_cinfo->nbuckets; i++) {
bucket = &fl_cinfo->buckets[i];
if (list_empty(&bucket->committing))
@@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
freeme = bucket->clseg;
bucket->clseg = NULL;
list_splice_init(&bucket->committing, &pages);
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
nfs_retry_commit(&pages, freeme, cinfo, i);
pnfs_put_lseg(freeme);
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
}
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
}
static unsigned int
@@ -238,14 +238,31 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
struct pnfs_commit_bucket *bucket;
bucket = &cinfo->ds->buckets[data->ds_commit_index];
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
list_splice_init(&bucket->committing, pages);
data->lseg = bucket->clseg;
bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
}
+/* Helper function for pnfs_generic_commit_pagelist to catch an empty
+ * page list. This can happen when two commits race. */
+static bool
+pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
+ struct nfs_commit_data *data,
+ struct nfs_commit_info *cinfo)
+{
+ if (list_empty(pages)) {
+ if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
+ wake_up_atomic_t(&cinfo->mds->rpcs_out);
+ nfs_commitdata_release(data);
+ return true;
+ }
+
+ return false;
+}
+
/* This follows nfs_commit_list pretty closely */
int
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -280,6 +297,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
list_for_each_entry_safe(data, tmp, &list, pages) {
list_del_init(&data->pages);
if (data->ds_commit_index < 0) {
+ /* another commit raced with us */
+ if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
+ data, cinfo))
+ continue;
+
nfs_init_commit(data, mds_pages, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
@@ -288,6 +310,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
LIST_HEAD(pages);
pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+
+ /* another commit raced with us */
+ if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
+ data, cinfo))
+ continue;
+
nfs_init_commit(data, &pages, data->lseg, cinfo);
initiate_commit(data, how);
}
@@ -874,12 +902,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
struct list_head *list;
struct pnfs_commit_bucket *buckets;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
if (!pnfs_is_valid_lseg(lseg)) {
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
cinfo->completion_ops->resched_write(cinfo, req);
return;
}
@@ -896,7 +924,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
cinfo->ds->nwritten++;
nfs_request_add_commit_list_locked(req, list, cinfo);
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1268280244e..2137e0202f25 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = {
enum {
Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
+ Opt_xprt_rdma6,
Opt_xprt_err
};
@@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = {
{ Opt_xprt_tcp, "tcp" },
{ Opt_xprt_tcp6, "tcp6" },
{ Opt_xprt_rdma, "rdma" },
+ { Opt_xprt_rdma6, "rdma6" },
{ Opt_xprt_err, NULL }
};
@@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
break;
+ case Opt_xprt_rdma6:
+ protofamily = AF_INET6;
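+ /* fall through: rdma6 shares the rdma transport setup below */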
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
@@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1,
struct nfs_server *server2)
{
struct sockaddr *sap1, *sap2;
+ struct rpc_xprt *xprt1 = server1->client->cl_xprt;
+ struct rpc_xprt *xprt2 = server2->client->cl_xprt;
+
+ if (!net_eq(xprt1->xprt_net, xprt2->xprt_net))
+ return 0;
sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5f4fd53e5764..e1c74d3db64d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req)
static int wb_priority(struct writeback_control *wbc)
{
int ret = 0;
- if (wbc->for_reclaim)
- return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
+
if (wbc->sync_mode == WB_SYNC_ALL)
ret = FLUSH_COND_STABLE;
return ret;
@@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
head = req->wb_head;
spin_lock(&inode->i_lock);
- if (likely(!PageSwapCache(head->wb_page))) {
+ if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
smp_mb__after_atomic();
@@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
- __set_page_dirty_nobuffers(req->wb_page);
+ if (req->wb_page)
+ __set_page_dirty_nobuffers(req->wb_page);
}
/*
@@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
* number of outstanding requests requiring a commit as well as
* the MM page stats.
*
- * The caller must hold the cinfo->lock, and the nfs_page lock.
+ * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
*/
void
nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
void
nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
- spin_unlock(cinfo->lock);
- nfs_mark_page_unstable(req->wb_page, cinfo);
+ spin_unlock(&cinfo->inode->i_lock);
+ if (req->wb_page)
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode)
{
- cinfo->lock = &inode->i_lock;
+ cinfo->inode = inode;
cinfo->mds = &NFS_I(inode)->commit_info;
cinfo->ds = pnfs_get_ds_info(inode);
cinfo->dreq = NULL;
@@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
return cinfo->mds->ncommit;
}
-/* cinfo->lock held by caller */
+/* cinfo->inode->i_lock held by caller */
int
nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_commit_info *cinfo, int max)
@@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
if (!nfs_lock_request(req))
continue;
kref_get(&req->wb_kref);
- if (cond_resched_lock(cinfo->lock))
+ if (cond_resched_lock(&cinfo->inode->i_lock))
list_safe_reset_next(req, tmp, wb_list);
nfs_request_remove_commit_list(req, cinfo);
nfs_list_add_request(req, dst);
@@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
{
int ret = 0;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
if (cinfo->mds->ncommit > 0) {
const int max = INT_MAX;
@@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
cinfo, max);
ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
}
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
return ret;
}
@@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
{
struct nfs_commit_data *data;
+ /* another commit raced with us */
+ if (list_empty(head))
+ return 0;
+
data = nfs_commitdata_alloc();
if (!data)
@@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
return -ENOMEM;
}
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
+{
+ struct inode *inode = file_inode(file);
+ struct nfs_open_context *open;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ int ret;
+
+ open = get_nfs_open_context(nfs_file_open_context(file));
+ req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out_put;
+ }
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
+ nfs_request_add_commit_list(req, &cinfo);
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret > 0)
+ ret = 0;
+
+ nfs_free_request(req);
+out_put:
+ put_nfs_open_context(open);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_commit_file);
+
/*
* COMMIT call returned
*/
@@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- nfs_clear_page_commit(req->wb_page);
+ if (req->wb_page)
+ nfs_clear_page_commit(req->wb_page);
dprintk("NFS: commit (%s/%llu %d@%lld)",
req->wb_context->dentry->d_sb->s_id,
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e55b5242614d..31f3df193bdb 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -290,7 +290,7 @@ out_free_buf:
return error;
}
-#define NFSD_MDS_PR_KEY 0x0100000000000000
+#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
/*
* We use the client ID as a unique key for the reservations.
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 93d5853f8c99..dba2ff8eaa68 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -379,7 +379,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
*/
hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- - hdr;
+ + rqstp->rq_arg.tail[0].iov_len - hdr;
/*
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7389cb1d7409..04c68d900324 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -710,22 +710,6 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc
}
}
-static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
-{
- struct rpc_xprt *xprt;
-
- if (args->protocol != XPRT_TRANSPORT_BC_TCP)
- return rpc_create(args);
-
- xprt = args->bc_xprt->xpt_bc_xprt;
- if (xprt) {
- xprt_get(xprt);
- return rpc_create_xprt(args, xprt);
- }
-
- return rpc_create(args);
-}
-
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
int maxtime = max_cb_time(clp->net);
@@ -768,7 +752,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
- client = create_backchannel_client(&args);
+ client = rpc_create(&args);
if (IS_ERR(client)) {
dprintk("NFSD: couldn't create callback client: %ld\n",
PTR_ERR(client));
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 825c7bc8d789..953c0755cb37 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -289,7 +289,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
status = nfserr_bad_stateid;
mutex_lock(&ls->ls_mutex);
- if (stateid->si_generation > stid->sc_stateid.si_generation)
+ if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid))
goto out_unlock_stid;
if (layout_type != ls->ls_layout_type)
goto out_unlock_stid;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0462eeddfff9..70d0b9b33031 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3480,12 +3480,17 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
}
static struct nfs4_ol_stateid *
-init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
- struct nfsd4_open *open)
+init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
{
struct nfs4_openowner *oo = open->op_openowner;
struct nfs4_ol_stateid *retstp = NULL;
+ struct nfs4_ol_stateid *stp;
+
+ stp = open->op_stp;
+ /* Initialize and lock the mutex before taking the spinlocks: mutex_lock() may sleep, so it must not nest inside them */
+ mutex_init(&stp->st_mutex);
+ mutex_lock(&stp->st_mutex);
spin_lock(&oo->oo_owner.so_client->cl_lock);
spin_lock(&fp->fi_lock);
@@ -3493,6 +3498,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
retstp = nfsd4_find_existing_open(fp, open);
if (retstp)
goto out_unlock;
+
+ open->op_stp = NULL;
atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
@@ -3502,14 +3509,19 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
stp->st_openstp = NULL;
- init_rwsem(&stp->st_rwsem);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
- return retstp;
+ if (retstp) {
+ mutex_lock(&retstp->st_mutex);
+ /* To keep mutex tracking happy */
+ mutex_unlock(&stp->st_mutex);
+ stp = retstp;
+ }
+ return stp;
}
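
The restructured init_open_stateid() now returns its stateid with st_mutex already held: the freshly allocated candidate is locked before any spinlocks are taken, and if another open raced in first, the existing stateid is locked only after the spinlocks are dropped while the candidate's lock is released to keep lock tracking balanced. A user-space analogue of that pattern, purely illustrative (single-slot "table", pthread mutexes standing in for st_mutex and the spinlocks):

	#include <assert.h>
	#include <pthread.h>
	#include <stdlib.h>

	struct stateid {
		pthread_mutex_t mutex;	/* plays the role of st_mutex */
	};

	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;	/* the "spinlock" */
	static struct stateid *table_slot;				/* one-entry lookup table */

	/* Return a stateid with its mutex held: either the pre-locked candidate
	 * (published on success) or an existing one found under table_lock. */
	static struct stateid *get_locked_stateid(struct stateid *candidate)
	{
		struct stateid *existing;

		pthread_mutex_init(&candidate->mutex, NULL);
		pthread_mutex_lock(&candidate->mutex);		/* lock before publishing */

		pthread_mutex_lock(&table_lock);
		existing = table_slot;
		if (!existing)
			table_slot = candidate;			/* we won the race */
		pthread_mutex_unlock(&table_lock);

		if (existing) {
			pthread_mutex_lock(&existing->mutex);	/* lock the winner ...   */
			pthread_mutex_unlock(&candidate->mutex);/* ... and balance ours */
			return existing;
		}
		return candidate;
	}

	int main(void)
	{
		struct stateid *a = calloc(1, sizeof(*a));
		struct stateid *b = calloc(1, sizeof(*b));
		struct stateid *got;

		got = get_locked_stateid(a);	/* first caller publishes and keeps a */
		assert(got == a);
		pthread_mutex_unlock(&got->mutex);

		got = get_locked_stateid(b);	/* second caller loses the race, gets a */
		assert(got == a);
		pthread_mutex_unlock(&got->mutex);

		free(b);
		return 0;
	}
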
/*
@@ -4305,7 +4317,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct nfs4_ol_stateid *stp = NULL;
- struct nfs4_ol_stateid *swapstp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -4335,32 +4346,28 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
*/
if (stp) {
/* Stateid was found, this is an OPEN upgrade */
- down_read(&stp->st_rwsem);
+ mutex_lock(&stp->st_mutex);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
if (status) {
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
goto out;
}
} else {
- stp = open->op_stp;
- open->op_stp = NULL;
- swapstp = init_open_stateid(stp, fp, open);
- if (swapstp) {
- nfs4_put_stid(&stp->st_stid);
- stp = swapstp;
- down_read(&stp->st_rwsem);
+ /* stp is returned locked. */
+ stp = init_open_stateid(fp, open);
+ /* See if we lost the race to some other thread */
+ if (stp->st_access_bmap != 0) {
status = nfs4_upgrade_open(rqstp, fp, current_fh,
stp, open);
if (status) {
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
goto out;
}
goto upgrade_out;
}
- down_read(&stp->st_rwsem);
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
release_open_stateid(stp);
goto out;
}
@@ -4372,7 +4379,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
}
upgrade_out:
nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
if (nfsd4_has_session(&resp->cstate)) {
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4651,12 +4658,6 @@ grace_disallows_io(struct net *net, struct inode *inode)
return opens_in_grace(net) && mandatory_lock(inode);
}
-/* Returns true iff a is later than b: */
-static bool stateid_generation_after(stateid_t *a, stateid_t *b)
-{
- return (s32)(a->si_generation - b->si_generation) > 0;
-}
-
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
@@ -4670,7 +4671,7 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
return nfs_ok;
/* If the client sends us a stateid from the future, it's buggy: */
- if (stateid_generation_after(in, ref))
+ if (nfsd4_stateid_generation_after(in, ref))
return nfserr_bad_stateid;
/*
* However, we could see a stateid from the past, even from a
@@ -4983,12 +4984,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
- down_write(&stp->st_rwsem);
+ mutex_lock(&stp->st_mutex);
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
if (status == nfs_ok)
status = nfs4_check_fh(current_fh, &stp->st_stid);
if (status != nfs_ok)
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
return status;
}
@@ -5036,7 +5037,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
return status;
oo = openowner(stp->st_stateowner);
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
}
@@ -5068,12 +5069,12 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
if (oo->oo_flags & NFS4_OO_CONFIRMED) {
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
goto put_stateid;
}
oo->oo_flags |= NFS4_OO_CONFIRMED;
nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -5149,7 +5150,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
status = nfs_ok;
put_stateid:
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -5202,7 +5203,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfsd4_close_open_stateid(stp);
@@ -5428,7 +5429,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
- init_rwsem(&stp->st_rwsem);
+ mutex_init(&stp->st_mutex);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
spin_lock(&fp->fi_lock);
@@ -5597,7 +5598,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&open_stp, nn);
if (status)
goto out;
- up_write(&open_stp->st_rwsem);
+ mutex_unlock(&open_stp->st_mutex);
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5606,7 +5607,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
if (status == nfs_ok)
- down_write(&lock_stp->st_rwsem);
+ mutex_lock(&lock_stp->st_mutex);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
@@ -5710,7 +5711,7 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
- up_write(&lock_stp->st_rwsem);
+ mutex_unlock(&lock_stp->st_mutex);
/*
* If this is a new, never-before-used stateid, and we are
@@ -5880,7 +5881,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fput:
fput(filp);
put_stateid:
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index c050c53036a6..64053eadeb81 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -535,7 +535,7 @@ struct nfs4_ol_stateid {
unsigned char st_access_bmap;
unsigned char st_deny_bmap;
struct nfs4_ol_stateid *st_openstp;
- struct rw_semaphore st_rwsem;
+ struct mutex st_mutex;
};
static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
@@ -573,6 +573,11 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_SEQUENCE,
};
+/* Returns true iff a is later than b: */
+static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+ return (s32)(a->si_generation - b->si_generation) > 0;
+}
struct nfsd4_compound_state;
struct nfsd_net;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 2ccbf5531554..1a85d94f5b25 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- * Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
*/
#include <linux/types.h>
@@ -58,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode)
* @inode: inode of metadata file using this allocator
* @entry_size: size of the persistent object
*/
-int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
+int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -73,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
mi->mi_blocks_per_group =
DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
mi->mi_entries_per_block) + 1;
- /* Number of blocks in a group including entry blocks and
- a bitmap block */
+ /*
+ * Number of blocks in a group including entry blocks
+ * and a bitmap block
+ */
mi->mi_blocks_per_desc_block =
nilfs_palloc_groups_per_desc_block(inode) *
mi->mi_blocks_per_group + 1;
- /* Number of blocks per descriptor including the
- descriptor block */
+ /*
+ * Number of blocks per descriptor including the
+ * descriptor block
+ */
return 0;
}
@@ -389,7 +388,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned bsize,
+ unsigned int bsize,
spinlock_t *lock)
{
int pos, end = bsize;
@@ -624,7 +623,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
+ "entry number %llu already freed: ino=%lu",
(unsigned long long)req->pr_entry_nr,
(unsigned long)inode->i_ino);
else
@@ -665,7 +664,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
+ "entry number %llu already freed: ino=%lu",
(unsigned long long)req->pr_entry_nr,
(unsigned long)inode->i_ino);
else
@@ -740,8 +739,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned long group, group_offset;
__u64 group_min_nr, last_nrs[8];
const unsigned long epg = nilfs_palloc_entries_per_group(inode);
- const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
- unsigned entry_start, end, pos;
+ const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned int entry_start, end, pos;
spinlock_t *lock;
int i, j, k, ret;
u32 nfree;
@@ -774,7 +773,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
+ "entry number %llu already freed: ino=%lu",
(unsigned long long)entry_nrs[j],
(unsigned long)inode->i_ino);
} else {
@@ -819,7 +818,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
last_nrs[k]);
if (ret && ret != -ENOENT) {
nilfs_warning(inode->i_sb, __func__,
- "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ "failed to delete block of entry %llu: ino=%lu, err=%d",
(unsigned long long)last_nrs[k],
(unsigned long)inode->i_ino, ret);
}
@@ -838,7 +837,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
ret = nilfs_palloc_delete_bitmap_block(inode, group);
if (ret && ret != -ENOENT) {
nilfs_warning(inode->i_sb, __func__,
- "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d",
group,
(unsigned long)inode->i_ino, ret);
}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 6e6f49aa53df..05149e606a78 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- * Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
*/
#ifndef _NILFS_ALLOC_H
@@ -42,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode)
return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */);
}
-int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
+int nilfs_palloc_init_blockgroup(struct inode *, unsigned int);
int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
struct buffer_head **);
void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index a9fb3636c142..f2a7877e0c8c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/fs.h>
@@ -46,7 +42,7 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
if (err == -EINVAL) {
nilfs_error(inode->i_sb, fname,
- "broken bmap (inode number=%lu)\n", inode->i_ino);
+ "broken bmap (inode number=%lu)", inode->i_ino);
err = -EIO;
}
return err;
@@ -97,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
}
int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
- unsigned maxblocks)
+ unsigned int maxblocks)
{
int ret;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bfa817ce40b3..b6a4c8f93ac8 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_BMAP_H
@@ -61,7 +57,7 @@ struct nilfs_bmap_stats {
struct nilfs_bmap_operations {
int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
- unsigned);
+ unsigned int);
int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
int (*bop_delete)(struct nilfs_bmap *, __u64);
void (*bop_clear)(struct nilfs_bmap *);
@@ -126,10 +122,14 @@ struct nilfs_bmap {
/* pointer type */
#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */
-#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single
- version) */
-#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple
- versions) */
+#define NILFS_BMAP_PTR_VS 1 /*
+ * virtual block number (single
+ * version)
+ */
+#define NILFS_BMAP_PTR_VM 2 /*
+ * virtual block number (has multiple
+ * versions)
+ */
#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */
#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0)
@@ -154,7 +154,7 @@ struct nilfs_bmap_store {
int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int);
int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index e0c9daf9aa22..0576033699bc 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * This file was originally written by Seiji Kihara <kihara@osrg.net>
- * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
- * stabilization and simplification.
+ * Originally written by Seiji Kihara.
+ * Fully revised by Ryusuke Konishi for stabilization and simplification.
*
*/
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index d876b565ce64..2cc1b80e18f7 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Seiji Kihara.
+ * Revised by Ryusuke Konishi.
*/
#ifndef _NILFS_BTNODE_H
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 3a3821b00486..eccb1c89ccbb 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/slab.h>
@@ -689,7 +685,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
}
static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
- __u64 key, __u64 *ptrp, unsigned maxblocks)
+ __u64 key, __u64 *ptrp,
+ unsigned int maxblocks)
{
struct nilfs_btree_path *path;
struct nilfs_btree_node *node;
@@ -1032,12 +1029,12 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
if (ptr != NILFS_BMAP_INVALID_PTR)
/* sequential access */
return ptr;
- else {
- ptr = nilfs_btree_find_near(btree, path);
- if (ptr != NILFS_BMAP_INVALID_PTR)
- /* near */
- return ptr;
- }
+
+ ptr = nilfs_btree_find_near(btree, path);
+ if (ptr != NILFS_BMAP_INVALID_PTR)
+ /* near */
+ return ptr;
+
/* block group */
return nilfs_bmap_find_target_in_group(btree);
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 22c02e35b6ef..df1a25faa83b 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_BTREE_H
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index b6596cab9e99..8a3d3b65af3f 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/kernel.h>
@@ -41,6 +37,7 @@ static unsigned long
nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+
do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
return (unsigned long)tcno;
}
@@ -50,6 +47,7 @@ static unsigned long
nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
}
@@ -433,7 +431,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
}
static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz,
+ size_t nci)
{
struct nilfs_checkpoint *cp;
struct nilfs_cpinfo *ci = buf;
@@ -484,7 +483,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
}
static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz,
+ size_t nci)
{
struct buffer_head *bh;
struct nilfs_cpfile_header *header;
@@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
*/
ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz, size_t nci)
{
switch (mode) {
case NILFS_CHECKPOINT:
@@ -870,8 +870,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
void *kaddr;
int ret;
- /* CP number is invalid if it's zero or larger than the
- largest exist one.*/
+ /*
+ * CP number is invalid if it's zero or larger than the
+ * largest existing one.
+ */
if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
return -ENOENT;
down_read(&NILFS_MDT(cpfile)->mi_sem);
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index a242b9a314f9..0249744ae234 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_CPFILE_H
@@ -37,8 +33,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
int nilfs_cpfile_is_snapshot(struct inode *, __u64);
int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
- size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *,
+ unsigned int, size_t);
int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7dc23f100e57..7367610ea807 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/types.h>
@@ -428,7 +424,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
return ret;
}
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
size_t nvi)
{
struct buffer_head *entry_bh;
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index cbd8e9732503..abbfdabcabea 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_DAT_H
@@ -51,7 +47,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
int nilfs_dat_mark_dirty(struct inode *, __u64);
int nilfs_dat_freev(struct inode *, __u64 *, size_t);
int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t);
int nilfs_dat_read(struct super_block *sb, size_t entry_size,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 6723d45a631a..e506f4f7120a 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji.
*/
/*
* linux/fs/ext2/dir.c
@@ -50,7 +46,7 @@
* nilfs uses block-sized chunks. Arguably, sector-sized ones would be
* more robust, but we have what we have
*/
-static inline unsigned nilfs_chunk_size(struct inode *inode)
+static inline unsigned int nilfs_chunk_size(struct inode *inode)
{
return inode->i_sb->s_blocksize;
}
@@ -65,9 +61,9 @@ static inline void nilfs_put_page(struct page *page)
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
*/
-static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
+static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr)
{
- unsigned last_byte = inode->i_size;
+ unsigned int last_byte = inode->i_size;
last_byte -= page_nr << PAGE_SHIFT;
if (last_byte > PAGE_SIZE)
@@ -75,20 +71,22 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned int from,
+ unsigned int to)
{
loff_t pos = page_offset(page) + from;
+
return __block_write_begin(page, pos, to - from, nilfs_get_block);
}
static void nilfs_commit_chunk(struct page *page,
struct address_space *mapping,
- unsigned from, unsigned to)
+ unsigned int from, unsigned int to)
{
struct inode *dir = mapping->host;
loff_t pos = page_offset(page) + from;
- unsigned len = to - from;
- unsigned nr_dirty, copied;
+ unsigned int len = to - from;
+ unsigned int nr_dirty, copied;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
@@ -106,10 +104,10 @@ static bool nilfs_check_page(struct page *page)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
- unsigned chunk_size = nilfs_chunk_size(dir);
+ unsigned int chunk_size = nilfs_chunk_size(dir);
char *kaddr = page_address(page);
- unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
+ unsigned int offs, rec_len;
+ unsigned int limit = PAGE_SIZE;
struct nilfs_dir_entry *p;
char *error;
@@ -259,7 +257,6 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
unsigned int offset = pos & ~PAGE_MASK;
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
-/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
return 0;
@@ -321,7 +318,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
{
const unsigned char *name = qstr->name;
int namelen = qstr->len;
- unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+ unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
struct page *page = NULL;
@@ -340,6 +337,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
n = start;
do {
char *kaddr;
+
page = nilfs_get_page(dir, n);
if (!IS_ERR(page)) {
kaddr = page_address(page);
@@ -410,8 +408,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
struct page *page, struct inode *inode)
{
- unsigned from = (char *) de - (char *) page_address(page);
- unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
+ unsigned int from = (char *)de - (char *)page_address(page);
+ unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len);
struct address_space *mapping = page->mapping;
int err;
@@ -433,15 +431,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
struct inode *dir = d_inode(dentry->d_parent);
const unsigned char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned chunk_size = nilfs_chunk_size(dir);
- unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+ unsigned int chunk_size = nilfs_chunk_size(dir);
+ unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
struct page *page = NULL;
struct nilfs_dir_entry *de;
unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr;
- unsigned from, to;
+ unsigned int from, to;
int err;
/*
@@ -533,13 +531,14 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
char *kaddr = page_address(page);
- unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
- unsigned to = ((char *)dir - kaddr) +
- nilfs_rec_len_from_disk(dir->rec_len);
- struct nilfs_dir_entry *pde = NULL;
- struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
+ unsigned int from, to;
+ struct nilfs_dir_entry *de, *pde = NULL;
int err;
+ from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
+ to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len);
+ de = (struct nilfs_dir_entry *)(kaddr + from);
+
while ((char *)de < (char *)dir) {
if (de->rec_len == 0) {
nilfs_error(inode->i_sb, __func__,
@@ -572,7 +571,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
{
struct address_space *mapping = inode->i_mapping;
struct page *page = grab_cache_page(mapping, 0);
- unsigned chunk_size = nilfs_chunk_size(inode);
+ unsigned int chunk_size = nilfs_chunk_size(inode);
struct nilfs_dir_entry *de;
int err;
void *kaddr;
@@ -630,8 +629,8 @@ int nilfs_empty_dir(struct inode *inode)
while ((char *)de <= kaddr) {
if (de->rec_len == 0) {
nilfs_error(inode->i_sb, __func__,
- "zero-length directory entry "
- "(kaddr=%p, de=%p)\n", kaddr, de);
+ "zero-length directory entry (kaddr=%p, de=%p)",
+ kaddr, de);
goto not_empty;
}
if (de->inode != 0) {
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index ebf89fd8ac1a..251a44928405 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/errno.h>
@@ -62,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
__u64 key, __u64 *ptrp,
- unsigned maxblocks)
+ unsigned int maxblocks)
{
struct inode *dat = NULL;
__u64 ptr, ptr2;
@@ -83,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
ptr = blocknr;
}
- maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+ maxblocks = min_t(unsigned int, maxblocks,
+ NILFS_DIRECT_KEY_MAX - key + 1);
for (cnt = 1; cnt < maxblocks &&
(ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
NILFS_BMAP_INVALID_PTR;
@@ -110,9 +107,9 @@ nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
if (ptr != NILFS_BMAP_INVALID_PTR)
/* sequential access */
return ptr;
- else
- /* block group */
- return nilfs_bmap_find_target_in_group(direct);
+
+ /* block group */
+ return nilfs_bmap_find_target_in_group(direct);
}
static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index dc643de20a25..3015a6e78724 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_DIRECT_H
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
index 19ccbf9522ab..00107fdb9343 100644
--- a/fs/nilfs2/export.h
+++ b/fs/nilfs2/export.h
@@ -20,6 +20,6 @@ struct nilfs_fid {
u32 parent_gen;
u64 parent_ino;
-} __attribute__ ((packed));
+} __packed;
#endif
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 088ba001c6ef..547381f3ce13 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>,
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji and Ryusuke Konishi.
*/
#include <linux/fs.h>
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 0224b7826ace..693aded72498 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- * and Ryusuke Konishi <ryusuke@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi.
+ * Revised by Ryusuke Konishi.
*
*/
/*
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 6548c7851b48..1d2b1805327a 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
*
*/
@@ -68,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /* 0 says find free inode from beginning of
- a group. dull code!! */
+ req.pr_entry_nr = 0; /*
+ * 0 says find free inode from beginning
+ * of a group. dull code!!
+ */
req.pr_entry_bh = NULL;
ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 679674d13372..23ad2f091e76 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
*
*/
@@ -36,6 +32,7 @@ static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
{
void *kaddr = kmap(ibh->b_page);
+
return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index cfebcd2fc7f3..a0ebdb17e912 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -87,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 blknum = 0;
int err = 0, ret;
- unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
+ unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits;
down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
@@ -133,11 +129,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
/* Error handling should be detailed */
set_buffer_new(bh_result);
set_buffer_delay(bh_result);
- map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
- to proper value */
+ map_bh(bh_result, inode->i_sb, 0);
+ /* Disk block number must be changed to proper value */
+
} else if (ret == -ENOENT) {
- /* not found is not error (e.g. hole); must return without
- the mapped state flag. */
+ /*
+ * not found is not error (e.g. hole); must return without
+ * the mapped state flag.
+ */
;
} else {
err = ret;
@@ -167,7 +166,7 @@ static int nilfs_readpage(struct file *file, struct page *page)
* @nr_pages - number of pages to be read
*/
static int nilfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct list_head *pages, unsigned int nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
}
@@ -226,7 +225,7 @@ static int nilfs_set_page_dirty(struct page *page)
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- unsigned nr_dirty = 0;
+ unsigned int nr_dirty = 0;
struct buffer_head *bh, *head;
/*
@@ -249,7 +248,7 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
nilfs_set_file_dirty(inode, nr_dirty);
}
@@ -291,8 +290,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- unsigned start = pos & (PAGE_SIZE - 1);
- unsigned nr_dirty;
+ unsigned int start = pos & (PAGE_SIZE - 1);
+ unsigned int nr_dirty;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, start,
@@ -399,23 +398,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
err = nilfs_init_acl(inode, dir);
if (unlikely(err))
- goto failed_after_creation; /* never occur. When supporting
- nilfs_init_acl(), proper cancellation of
- above jobs should be considered */
+ /*
+ * Never occur. When supporting nilfs_init_acl(),
+ * proper cancellation of above jobs should be considered.
+ */
+ goto failed_after_creation;
return inode;
failed_after_creation:
clear_nlink(inode);
unlock_new_inode(inode);
- iput(inode); /* raw_inode will be deleted through
- nilfs_evict_inode() */
+ iput(inode); /*
+ * raw_inode will be deleted through
+ * nilfs_evict_inode().
+ */
goto failed;
failed_ifile_create_inode:
make_bad_inode(inode);
- iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
- called */
+ iput(inode);
failed:
return ERR_PTR(err);
}
@@ -666,8 +668,10 @@ void nilfs_write_inode_common(struct inode *inode,
else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_device_code =
cpu_to_le64(huge_encode_dev(inode->i_rdev));
- /* When extending inode, nilfs->ns_inode_size should be checked
- for substitutions of appended fields */
+ /*
+ * When extending inode, nilfs->ns_inode_size should be checked
+ * for substitutions of appended fields.
+ */
}
void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
@@ -685,9 +689,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
nilfs_write_inode_common(inode, raw_inode, 0);
- /* XXX: call with has_bmap = 0 is a workaround to avoid
- deadlock of bmap. This delays update of i_bmap to just
- before writing */
+ /*
+ * XXX: call with has_bmap = 0 is a workaround to avoid
+ * deadlock of bmap. This delays update of i_bmap to just
+ * before writing.
+ */
+
nilfs_ifile_unmap_inode(ifile, ino, ibh);
}
@@ -752,14 +759,15 @@ void nilfs_truncate(struct inode *inode)
nilfs_mark_inode_dirty(inode);
nilfs_set_file_dirty(inode, 0);
nilfs_transaction_commit(sb);
- /* May construct a logical segment and may fail in sync mode.
- But truncate has no return value. */
+ /*
+ * May construct a logical segment and may fail in sync mode.
+ * But truncate has no return value.
+ */
}
static void nilfs_clear_inode(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
/*
* Free resources allocated in nilfs_read_inode(), here.
@@ -768,8 +776,8 @@ static void nilfs_clear_inode(struct inode *inode)
brelse(ii->i_bh);
ii->i_bh = NULL;
- if (mdi && mdi->mi_palloc_cache)
- nilfs_palloc_destroy_cache(inode);
+ if (nilfs_is_metadata_file_inode(inode))
+ nilfs_mdt_clear(inode);
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
@@ -811,8 +819,10 @@ void nilfs_evict_inode(struct inode *inode)
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
nilfs_transaction_commit(sb);
- /* May construct a logical segment and may fail in sync mode.
- But delete_inode has no return value. */
+ /*
+ * May construct a logical segment and may fail in sync mode.
+ * But delete_inode has no return value.
+ */
}
int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
@@ -856,6 +866,7 @@ out_err:
int nilfs_permission(struct inode *inode, int mask)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
+
if ((mask & MAY_WRITE) && root &&
root->cno != NILFS_CPTREE_CURRENT_CNO)
return -EROFS; /* snapshot is not writable */
@@ -906,7 +917,7 @@ int nilfs_inode_dirty(struct inode *inode)
return ret;
}
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -919,17 +930,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
spin_lock(&nilfs->ns_inode_lock);
if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
!test_bit(NILFS_I_BUSY, &ii->i_state)) {
- /* Because this routine may race with nilfs_dispose_list(),
- we have to check NILFS_I_QUEUED here, too. */
+ /*
+ * Because this routine may race with nilfs_dispose_list(),
+ * we have to check NILFS_I_QUEUED here, too.
+ */
if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
- /* This will happen when somebody is freeing
- this inode. */
+ /*
+ * This will happen when somebody is freeing
+ * this inode.
+ */
nilfs_warning(inode->i_sb, __func__,
- "cannot get inode (ino=%lu)\n",
+ "cannot get inode (ino=%lu)",
inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
- return -EINVAL; /* NILFS_I_DIRTY may remain for
- freeing inode */
+ return -EINVAL; /*
+ * NILFS_I_DIRTY may remain for
+ * freeing inode.
+ */
}
list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
set_bit(NILFS_I_QUEUED, &ii->i_state);
@@ -946,7 +963,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
nilfs_warning(inode->i_sb, __func__,
- "failed to reget inode block.\n");
+ "failed to reget inode block.");
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -973,7 +990,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
if (is_bad_inode(inode)) {
nilfs_warning(inode->i_sb, __func__,
- "tried to mark bad_inode dirty. ignored.\n");
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index e8fe24882b5b..358b57e2cdf9 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/fs.h>
@@ -783,6 +779,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
size_t nmembs = argv->v_nmembs;
struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
struct nilfs_bdesc *bdescs = buf;
+ struct buffer_head *bh;
int ret, i;
for (i = 0; i < nmembs; i++) {
@@ -800,12 +797,16 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
/* skip dead block */
continue;
if (bdescs[i].bd_level == 0) {
- ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
- bdescs[i].bd_offset);
- if (ret < 0) {
+ ret = nilfs_mdt_get_block(nilfs->ns_dat,
+ bdescs[i].bd_offset,
+ false, NULL, &bh);
+ if (unlikely(ret)) {
WARN_ON(ret == -ENOENT);
return ret;
}
+ mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(nilfs->ns_dat);
+ put_bh(bh);
} else {
ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
bdescs[i].bd_level);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6982b9153d5..3417d859a03c 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#include <linux/buffer_head.h>
@@ -32,6 +28,7 @@
#include "segment.h"
#include "page.h"
#include "mdt.h"
+#include "alloc.h" /* nilfs_palloc_destroy_cache() */
#include <trace/events/nilfs2.h>
@@ -393,34 +390,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
return ret;
}
-/**
- * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
- * @inode: inode of the meta data file
- * @block: block offset
- *
- * Return Value: On success, it returns 0. On error, the following negative
- * error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- *
- * %-EIO - I/O error
- *
- * %-ENOENT - the specified block does not exist (hole block)
- */
-int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
-{
- struct buffer_head *bh;
- int err;
-
- err = nilfs_mdt_read_block(inode, block, 0, &bh);
- if (unlikely(err))
- return err;
- mark_buffer_dirty(bh);
- nilfs_mdt_mark_dirty(inode);
- brelse(bh);
- return 0;
-}
-
int nilfs_mdt_fetch_dirty(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -497,8 +466,32 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
return 0;
}
-void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
- unsigned header_size)
+/**
+ * nilfs_mdt_clear - do cleanup for the metadata file
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear(struct inode *inode)
+{
+ struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+ if (mdi->mi_palloc_cache)
+ nilfs_palloc_destroy_cache(inode);
+}
+
+/**
+ * nilfs_mdt_destroy - release resources used by the metadata file
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_destroy(struct inode *inode)
+{
+ struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+ kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+ kfree(mdi);
+}
+
+void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size,
+ unsigned int header_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 03246cac3338..3f67f3932097 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#ifndef _NILFS_MDT_H
@@ -57,8 +53,8 @@ struct nilfs_shadow_map {
struct nilfs_mdt_info {
struct rw_semaphore mi_sem;
struct blockgroup_lock *mi_bgl;
- unsigned mi_entry_size;
- unsigned mi_first_entry_offset;
+ unsigned int mi_entry_size;
+ unsigned int mi_first_entry_offset;
unsigned long mi_entries_per_block;
struct nilfs_palloc_cache *mi_palloc_cache;
struct nilfs_shadow_map *mi_shadow;
@@ -71,6 +67,11 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
return inode->i_private;
}
+static inline int nilfs_is_metadata_file_inode(const struct inode *inode)
+{
+ return inode->i_private != NULL;
+}
+
/* Default GFP flags using highmem */
#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
@@ -83,11 +84,13 @@ int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
struct buffer_head **out_bh);
int nilfs_mdt_delete_block(struct inode *, unsigned long);
int nilfs_mdt_forget_block(struct inode *, unsigned long);
-int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
int nilfs_mdt_fetch_dirty(struct inode *);
int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
-void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
+void nilfs_mdt_clear(struct inode *inode);
+void nilfs_mdt_destroy(struct inode *inode);
+
+void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int);
int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 3b2af05f9fb4..1ec8ae5995a5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi.
*/
/*
* linux/fs/ext2/namei.c
@@ -49,6 +44,7 @@
static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = nilfs_add_link(dentry, inode);
+
if (!err) {
d_instantiate(dentry, inode);
unlock_new_inode(inode);
@@ -143,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
{
struct nilfs_transaction_info ti;
struct super_block *sb = dir->i_sb;
- unsigned l = strlen(symname)+1;
+ unsigned int l = strlen(symname) + 1;
struct inode *inode;
int err;
@@ -288,7 +284,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
if (!inode->i_nlink) {
nilfs_warning(inode->i_sb, __func__,
- "deleting nonexistent file (%lu), %d\n",
+ "deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 385704027575..b1d48bc0532d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Koji Sato and Ryusuke Konishi.
*/
#ifndef _NILFS_H
@@ -69,8 +64,10 @@ struct nilfs_inode_info {
*/
struct rw_semaphore xattr_sem;
#endif
- struct buffer_head *i_bh; /* i_bh contains a new or dirty
- disk inode */
+ struct buffer_head *i_bh; /*
+ * i_bh contains a new or dirty
+ * disk inode.
+ */
struct nilfs_root *i_root;
struct inode vfs_inode;
};
@@ -100,8 +97,10 @@ enum {
NILFS_I_NEW = 0, /* Inode is newly created */
NILFS_I_DIRTY, /* The file is dirty */
NILFS_I_QUEUED, /* inode is in dirty_files list */
- NILFS_I_BUSY, /* inode is grabbed by a segment
- constructor */
+ NILFS_I_BUSY, /*
+ * Inode is grabbed by a segment
+ * constructor
+ */
NILFS_I_COLLECTED, /* All dirty blocks are collected */
NILFS_I_UPDATED, /* The file has been written back */
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
@@ -145,8 +144,10 @@ enum {
struct nilfs_transaction_info {
u32 ti_magic;
void *ti_save;
- /* This should never used. If this happens,
- one of other filesystems has a bug. */
+ /*
+ * This should never be used. If it happens,
+ * one of other filesystems has a bug.
+ */
unsigned short ti_flags;
unsigned short ti_count;
};
@@ -156,8 +157,10 @@ struct nilfs_transaction_info {
/* ti_flags */
#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
-#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
- end of transaction. */
+#define NILFS_TI_SYNC 0x0002 /*
+ * Force to construct segment at the
+ * end of transaction.
+ */
#define NILFS_TI_GC 0x0004 /* GC context */
#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
#define NILFS_TI_WRITER 0x0010 /* Constructor context */
@@ -279,7 +282,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
extern int __nilfs_mark_inode_dirty(struct inode *, int);
extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 489391561cda..d97ba5f11b77 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- * Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
*/
#include <linux/pagemap.h>
@@ -440,12 +435,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
__nilfs_clear_page_dirty(page);
}
-unsigned nilfs_page_count_clean_buffers(struct page *page,
- unsigned from, unsigned to)
+unsigned int nilfs_page_count_clean_buffers(struct page *page,
+ unsigned int from, unsigned int to)
{
- unsigned block_start, block_end;
+ unsigned int block_start, block_end;
struct buffer_head *bh, *head;
- unsigned nc = 0;
+ unsigned int nc = 0;
for (bh = head = page_buffers(page), block_start = 0;
bh != head || !block_start;
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index a43b8287d012..f3687c958fa8 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- * Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
*/
#ifndef _NILFS_PAGE_H
@@ -58,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
-unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
+ unsigned int);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5afa77fadc11..d893dc912b62 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#include <linux/buffer_head.h>
@@ -47,8 +43,10 @@ enum {
/* work structure for recovery */
struct nilfs_recovery_block {
- ino_t ino; /* Inode number of the file that this block
- belongs to */
+ ino_t ino; /*
+ * Inode number of the file that this block
+ * belongs to
+ */
sector_t blocknr; /* block number */
__u64 vblocknr; /* virtual block number */
unsigned long blkoff; /* File offset of the data block (per block) */
@@ -156,7 +154,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
sr = (struct nilfs_super_root *)bh_sr->b_data;
if (check) {
- unsigned bytes = le16_to_cpu(sr->sr_bytes);
+ unsigned int bytes = le16_to_cpu(sr->sr_bytes);
if (bytes == 0 || bytes > nilfs->ns_blocksize) {
ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
@@ -508,7 +506,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
{
struct inode *inode;
struct nilfs_recovery_block *rb, *n;
- unsigned blocksize = nilfs->ns_blocksize;
+ unsigned int blocksize = nilfs->ns_blocksize;
struct page *page;
loff_t pos;
int err = 0, err2 = 0;
@@ -526,6 +524,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
0, &page, nilfs_get_block);
if (unlikely(err)) {
loff_t isize = inode->i_size;
+
if (pos + blocksize > isize)
nilfs_write_failed(inode->i_mapping,
pos + blocksize);
@@ -872,9 +871,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
flags = le16_to_cpu(sum->ss_flags);
if (!(flags & NILFS_SS_SR) && !scan_newer) {
- /* This will never happen because a superblock
- (last_segment) always points to a pseg
- having a super root. */
+ /*
+ * This will never happen because a superblock
+ * (last_segment) always points to a pseg with
+ * a super root.
+ */
ret = NILFS_SEG_FAIL_CONSISTENCY;
goto failed;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index f63620ce3892..bf36df10540b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -133,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
return 0;
}
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags,
time_t ctime, __u64 cno)
{
int err;
@@ -240,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
{
struct nilfs_super_root *raw_sr;
struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
- unsigned srsize;
+ unsigned int srsize;
u32 crc;
raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index b04f08cc2397..7bbccc099709 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
#ifndef _NILFS_SEGBUF_H
@@ -82,7 +78,7 @@ struct nilfs_segment_buffer {
__u64 sb_nextnum;
sector_t sb_fseg_start, sb_fseg_end;
sector_t sb_pseg_start;
- unsigned sb_rest_blocks;
+ unsigned int sb_rest_blocks;
/* Buffers */
struct list_head sb_segsum_buffers;
@@ -124,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
struct nilfs_segment_buffer *prev);
void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t,
+ __u64);
int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
struct buffer_head **);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 4317f72568e6..e78b68a81aec 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -49,18 +45,26 @@
*/
#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
-#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
- appended in collection retry loop */
+#define SC_MAX_SEGDELTA 64 /*
+ * Upper limit of the number of segments
+ * appended in collection retry loop
+ */
/* Construction mode */
enum {
SC_LSEG_SR = 1, /* Make a logical segment having a super root */
- SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
- a logical segment without a super root */
- SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
- creating a checkpoint */
- SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
- a checkpoint */
+ SC_LSEG_DSYNC, /*
+ * Flush data blocks of a given file and make
+ * a logical segment without a super root.
+ */
+ SC_FLUSH_FILE, /*
+ * Flush data files, leads to segment writes without
+ * creating a checkpoint.
+ */
+ SC_FLUSH_DAT, /*
+ * Flush DAT file. This also creates segments
+ * without a checkpoint.
+ */
};
/* Stage numbers of dirty block collection */
@@ -154,17 +158,15 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
if (cur_ti) {
if (cur_ti->ti_magic == NILFS_TI_MAGIC)
return ++cur_ti->ti_count;
- else {
- /*
- * If journal_info field is occupied by other FS,
- * it is saved and will be restored on
- * nilfs_transaction_commit().
- */
- printk(KERN_WARNING
- "NILFS warning: journal info from a different "
- "FS\n");
- save = current->journal_info;
- }
+
+ /*
+ * If journal_info field is occupied by other FS,
+ * it is saved and will be restored on
+ * nilfs_transaction_commit().
+ */
+ printk(KERN_WARNING
+ "NILFS warning: journal info from a different FS\n");
+ save = current->journal_info;
}
if (!ti) {
ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
@@ -397,10 +399,10 @@ static void nilfs_transaction_unlock(struct super_block *sb)
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
struct nilfs_segsum_pointer *ssp,
- unsigned bytes)
+ unsigned int bytes)
{
struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
- unsigned blocksize = sci->sc_super->s_blocksize;
+ unsigned int blocksize = sci->sc_super->s_blocksize;
void *p;
if (unlikely(ssp->offset + bytes > blocksize)) {
@@ -422,8 +424,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
struct buffer_head *sumbh;
- unsigned sumbytes;
- unsigned flags = 0;
+ unsigned int sumbytes;
+ unsigned int flags = 0;
int err;
if (nilfs_doing_gc())
@@ -444,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
{
sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
- return -E2BIG; /* The current segment is filled up
- (internal code) */
+ return -E2BIG; /*
+ * The current segment is filled up
+ * (internal code)
+ */
sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
return nilfs_segctor_reset_segment_buffer(sci);
}
@@ -472,9 +476,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
*/
static int nilfs_segctor_segsum_block_required(
struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
- unsigned binfo_size)
+ unsigned int binfo_size)
{
- unsigned blocksize = sci->sc_super->s_blocksize;
+ unsigned int blocksize = sci->sc_super->s_blocksize;
/* Size of finfo and binfo is enough small against blocksize */
return ssp->offset + binfo_size +
@@ -533,7 +537,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
struct buffer_head *bh,
struct inode *inode,
- unsigned binfo_size)
+ unsigned int binfo_size)
{
struct nilfs_segment_buffer *segbuf;
int required, err = 0;
@@ -617,7 +621,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
*vblocknr = binfo->bi_v.bi_vblocknr;
}
-static struct nilfs_sc_operations nilfs_sc_file_ops = {
+static const struct nilfs_sc_operations nilfs_sc_file_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_file_bmap,
@@ -666,7 +670,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
*binfo_dat = binfo->bi_dat;
}
-static struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static const struct nilfs_sc_operations nilfs_sc_dat_ops = {
.collect_data = nilfs_collect_dat_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_dat_bmap,
@@ -674,7 +678,7 @@ static struct nilfs_sc_operations nilfs_sc_dat_ops = {
.write_node_binfo = nilfs_write_dat_node_binfo,
};
-static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static const struct nilfs_sc_operations nilfs_sc_dsync_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = NULL,
.collect_bmap = NULL,
@@ -777,7 +781,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
{
struct nilfs_inode_info *ii, *n;
struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
- unsigned nv = 0;
+ unsigned int nv = 0;
while (!list_empty(head)) {
spin_lock(&nilfs->ns_inode_lock);
@@ -875,9 +879,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
&raw_cp, &bh_cp);
if (likely(!err)) {
- /* The following code is duplicated with cpfile. But, it is
- needed to collect the checkpoint even if it was not newly
- created */
+ /*
+ * The following code is duplicated with cpfile. But, it is
+ * needed to collect the checkpoint even if it was not newly
+ * created.
+ */
mark_buffer_dirty(bh_cp);
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
@@ -958,7 +964,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
{
struct buffer_head *bh_sr;
struct nilfs_super_root *raw_sr;
- unsigned isz, srsz;
+ unsigned int isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
@@ -1043,7 +1049,7 @@ static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
struct inode *inode,
- struct nilfs_sc_operations *sc_ops)
+ const struct nilfs_sc_operations *sc_ops)
{
LIST_HEAD(data_buffers);
LIST_HEAD(node_buffers);
@@ -1406,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs,
if (atomic_read(&segbuf->sb_err)) {
/* Case 1: The first segment failed */
if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
- /* Case 1a: Partial segment appended into an existing
- segment */
+ /*
+ * Case 1a: Partial segment appended into an existing
+ * segment
+ */
nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
segbuf->sb_fseg_end);
else /* Case 1b: New full segment */
@@ -1550,7 +1558,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
sector_t blocknr;
unsigned long nfinfo = segbuf->sb_sum.nfinfo;
unsigned long nblocks = 0, ndatablk = 0;
- struct nilfs_sc_operations *sc_op = NULL;
+ const struct nilfs_sc_operations *sc_op = NULL;
struct nilfs_segsum_pointer ssp;
struct nilfs_finfo *finfo = NULL;
union nilfs_binfo binfo;
@@ -1631,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
static void nilfs_begin_page_io(struct page *page)
{
if (!page || PageWriteback(page))
- /* For split b-tree node pages, this function may be called
- twice. We ignore the 2nd or later calls by this check. */
+ /*
+ * For split b-tree node pages, this function may be called
+ * twice. We ignore the 2nd or later calls by this check.
+ */
return;
lock_page(page);
@@ -1942,7 +1952,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
nilfs_warning(sci->sc_super, __func__,
- "failed to get inode block.\n");
+ "failed to get inode block.");
return err;
}
mark_buffer_dirty(ibh);
@@ -2395,6 +2405,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_construction_timeout(unsigned long data)
{
struct task_struct *p = (struct task_struct *)data;
+
wake_up_process(p);
}
@@ -2555,10 +2566,10 @@ static int nilfs_segctor_thread(void *arg)
if (timeout || sci->sc_seq_request != sci->sc_seq_done)
mode = SC_LSEG_SR;
- else if (!sci->sc_flush_request)
- break;
- else
+ else if (sci->sc_flush_request)
mode = nilfs_segctor_flush_mode(sci);
+ else
+ break;
spin_unlock(&sci->sc_state_lock);
nilfs_segctor_thread_construct(sci, mode);
@@ -2684,8 +2695,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
{
int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
- /* The segctord thread was stopped and its timer was removed.
- But some tasks remain. */
+ /*
+ * The segctord thread was stopped and its timer was removed.
+ * But some tasks remain.
+ */
do {
struct nilfs_transaction_info ti;
@@ -2727,13 +2740,13 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
if (!list_empty(&sci->sc_dirty_files)) {
nilfs_warning(sci->sc_super, __func__,
- "dirty file(s) after the final construction\n");
+ "dirty file(s) after the final construction");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
nilfs_warning(sci->sc_super, __func__,
- "iput queue is not empty\n");
+ "iput queue is not empty");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2810,7 +2823,7 @@ void nilfs_detach_log_writer(struct super_block *sb)
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
nilfs_warning(sb, __func__,
- "Hit dirty file after stopped log writer\n");
+ "Hit dirty file after stopped log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0408b9b2814b..6565c10b7b76 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
#ifndef _NILFS_SEGMENT_H
@@ -75,7 +71,7 @@ struct nilfs_recovery_info {
*/
struct nilfs_cstage {
int scnt;
- unsigned flags;
+ unsigned int flags;
struct nilfs_inode_info *dirty_file_ptr;
struct nilfs_inode_info *gc_inode_ptr;
};
@@ -84,7 +80,7 @@ struct nilfs_segment_buffer;
struct nilfs_segsum_pointer {
struct buffer_head *bh;
- unsigned offset; /* offset in bytes */
+ unsigned int offset; /* offset in bytes */
};
/**
@@ -193,11 +189,15 @@ enum {
NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
NILFS_SC_UNCLOSED, /* Logical segment is not closed */
NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
- NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
- checkpoint */
- NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
- other than DAT, cpfile, sufile, or files
- moved by GC */
+ NILFS_SC_PRIOR_FLUSH, /*
+ * Requesting immediate flush without making a
+ * checkpoint
+ */
+ NILFS_SC_HAVE_DELTA, /*
+ * Next checkpoint will have update of files
+ * other than DAT, cpfile, sufile, or files
+ * moved by GC.
+ */
};
/* sc_state */
@@ -207,17 +207,23 @@ enum {
/*
* Constant parameters
*/
-#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
- destroying segctord */
+#define NILFS_SC_CLEANUP_RETRY 3 /*
+ * Retry count of construction when
+ * destroying segctord
+ */
/*
* Default values of timeout, in seconds.
*/
-#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
- It triggers construction of a
- logical segment with a super root */
-#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
- creation */
+#define NILFS_SC_DEFAULT_TIMEOUT 5 /*
+ * Timeout value of dirty blocks.
+ * It triggers construction of a
+ * logical segment with a super root.
+ */
+#define NILFS_SC_DEFAULT_SR_FREQ 30 /*
+ * Maximum frequency of super root
+ * creation
+ */
/*
* The default threshold amount of data, in block counts.
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 52821ffc11f4..1963595a1580 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Koji Sato.
+ * Revised by Ryusuke Konishi.
*/
#include <linux/kernel.h>
@@ -61,6 +57,7 @@ static unsigned long
nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+
do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
return (unsigned long)t;
}
@@ -69,6 +66,7 @@ static unsigned long
nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+
return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
}
@@ -819,7 +817,7 @@ out:
* %-ENOMEM - Insufficient amount of memory available.
*/
ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
- unsigned sisz, size_t nsi)
+ unsigned int sisz, size_t nsi)
{
struct buffer_head *su_bh;
struct nilfs_segment_usage *su;
@@ -897,7 +895,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
* %-EINVAL - Invalid values in input (segment number, flags or nblocks)
*/
ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
- unsigned supsz, size_t nsup)
+ unsigned int supsz, size_t nsup)
{
struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
struct buffer_head *header_bh, *bh;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index b8afd72f2379..46e89872294c 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_SUFILE_H
@@ -42,9 +38,9 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
unsigned long nblocks, time_t modtime);
int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int,
size_t);
-ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
+ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned int, size_t);
int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
void (*dofunc)(struct inode *, __u64,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7f5d3d9f1c37..666107a18a22 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
/*
* linux/fs/ext2/super.c
@@ -173,12 +169,10 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
static void nilfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
- if (mdi) {
- kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
- kfree(mdi);
- }
+ if (nilfs_is_metadata_file_inode(inode))
+ nilfs_mdt_destroy(inode);
+
kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
}
@@ -279,7 +273,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
}
} else if (sbp[1] &&
sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
- memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+ memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
}
if (flip && sbp[1])
@@ -749,6 +743,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -891,7 +886,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
- return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+ return !parse_options(data, sb, 0) ? -EINVAL : 0;
}
int nilfs_check_feature_compatibility(struct super_block *sb,
@@ -1316,7 +1311,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s->s_root) {
- s_new = true;
+ s_new = true;
/* New superblock instance created */
s->s_mode = mode;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index bbb0dcc35905..8ffa42b704d8 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -68,7 +68,7 @@ static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
static const struct sysfs_ops nilfs_##name##_attr_ops = { \
.show = nilfs_##name##_attr_show, \
.store = nilfs_##name##_attr_store, \
-};
+}
#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
static void nilfs_##name##_attr_release(struct kobject *kobj) \
@@ -84,7 +84,7 @@ static struct kobj_type nilfs_##name##_ktype = { \
.default_attrs = nilfs_##name##_attrs, \
.sysfs_ops = &nilfs_##name##_attr_ops, \
.release = nilfs_##name##_attr_release, \
-};
+}
#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
@@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- unsigned sbwcount;
+ unsigned int sbwcount;
down_read(&nilfs->ns_sem);
sbwcount = nilfs->ns_sbwcount;
@@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- unsigned sb_update_freq;
+ unsigned int sb_update_freq;
down_read(&nilfs->ns_sem);
sb_update_freq = nilfs->ns_sb_update_freq;
@@ -784,7 +784,7 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
const char *buf, size_t count)
{
- unsigned val;
+ unsigned int val;
int err;
err = kstrtouint(skip_spaces(buf), 0, &val);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 677e3a1a8370..648cedf9c06e 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -66,7 +66,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct kobject *, struct attribute *, \
const char *, size_t); \
-};
+}
NILFS_COMMON_ATTR_STRUCT(feature);
@@ -77,7 +77,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
const char *, size_t); \
-};
+}
NILFS_DEV_ATTR_STRUCT(dev);
NILFS_DEV_ATTR_STRUCT(segments);
@@ -93,7 +93,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
const char *, size_t); \
-};
+}
NILFS_CP_ATTR_STRUCT(snapshot);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 69bd801afb53..809bd2de7ad0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -112,8 +108,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
struct nilfs_super_root *raw_sr;
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct nilfs_inode *rawi;
- unsigned dat_entry_size, segment_usage_size, checkpoint_size;
- unsigned inode_size;
+ unsigned int dat_entry_size, segment_usage_size, checkpoint_size;
+ unsigned int inode_size;
int err;
err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
@@ -621,8 +617,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
if (err)
goto out;
- /* not failed_sbh; sbh is released automatically
- when reloading fails. */
+ /*
+ * Not to failed_sbh; sbh is released automatically
+ * when reloading fails.
+ */
}
nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
nilfs->ns_blocksize = blocksize;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 23778d385836..79369fd6b13b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -118,10 +114,10 @@ struct the_nilfs {
struct buffer_head *ns_sbh[2];
struct nilfs_super_block *ns_sbp[2];
time_t ns_sbwtime;
- unsigned ns_sbwcount;
- unsigned ns_sbsize;
- unsigned ns_mount_state;
- unsigned ns_sb_update_freq;
+ unsigned int ns_sbwcount;
+ unsigned int ns_sbsize;
+ unsigned int ns_mount_state;
+ unsigned int ns_sb_update_freq;
/*
* Following fields are dedicated to a writable FS-instance.
@@ -226,15 +222,14 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
* Mount option operations
*/
#define nilfs_clear_opt(nilfs, opt) \
- do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
+ ((nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt)
#define nilfs_set_opt(nilfs, opt) \
- do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
+ ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt)
#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
#define nilfs_write_opt(nilfs, mask, opt) \
- do { (nilfs)->ns_mount_opt = \
+ ((nilfs)->ns_mount_opt = \
(((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
- NILFS_MOUNT_##opt); \
- } while (0)
+ NILFS_MOUNT_##opt)) \
/**
* struct nilfs_root - nilfs root object
@@ -273,6 +268,7 @@ struct nilfs_root {
static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
{
u64 t = get_seconds();
+
return t < nilfs->ns_sbwtime ||
t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
}
@@ -280,6 +276,7 @@ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
{
int flip_bits = nilfs->ns_sbwcount & 0x0FL;
+
return (flip_bits != 0x08 && flip_bits != 0x0F);
}
@@ -308,7 +305,7 @@ static inline void nilfs_get_root(struct nilfs_root *root)
static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
{
- unsigned valid_fs;
+ unsigned int valid_fs;
down_read(&nilfs->ns_sem);
valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
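The mount-option helpers changed above become plain expressions instead of do { ... } while (0) statements, so they can also appear where C requires an expression. A minimal caller sketch, with the nilfs_set_barrier() helper invented purely for illustration:

/*
 * Hypothetical caller (not in this patch): with the expression form,
 * the option helpers are legal inside a conditional expression.
 */
static inline void nilfs_set_barrier(struct the_nilfs *nilfs, bool on)
{
	on ? nilfs_set_opt(nilfs, BARRIER) : nilfs_clear_opt(nilfs, BARRIER);
}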
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a857e7..0a3bc2cf192c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
&mnt->mnt_root->d_lock);
}
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62cb2854..3e2dd85be5dd 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
*/
void fsnotify_destroy_group(struct fsnotify_group *group)
{
- /* clear all inode marks for this group */
- fsnotify_clear_marks_by_group(group);
+ /* clear all inode marks for this group, attach them to destroy_list */
+ fsnotify_detach_group_marks(group);
- synchronize_srcu(&fsnotify_mark_srcu);
+ /*
+ * Wait for fsnotify_mark_srcu period to end and free all marks in
+ * destroy_list
+ */
+ fsnotify_mark_destroy_list();
- /* clear the notification queue of all events */
+ /*
+ * Since we have waited for fsnotify_mark_srcu in
+ * fsnotify_mark_destroy_list() there can be no outstanding event
+ * notification against this group. So clearing the notification queue
+ * of all events is reliable now.
+ */
fsnotify_flush_notify(group);
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d7d373..d3fea0bd89e2 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
}
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
*/
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return;
+ return false;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
/*
* Some groups like to know that marks are being freed. This is a
* callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
+
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+
+ return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ if (__fsnotify_free_mark(mark)) {
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+ }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
/*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
*/
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
{
- fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+ struct fsnotify_mark *mark;
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&group->marks_list)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&group->marks_list,
+ struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&group->mark_mutex);
+ __fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ }
}
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
mark->free_mark = free_mark;
}
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
fsnotify_put_mark(mark);
}
}
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+ fsnotify_mark_destroy_list();
+}
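The mark.c rework above separates detaching a mark from actually freeing it, and frees whole batches only after one SRCU grace period. A minimal sketch of that deferred-free pattern; all example_* names are invented for illustration and are not part of the patch:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(example_srcu);		/* readers run under srcu_read_lock() */
static DEFINE_SPINLOCK(example_destroy_lock);
static LIST_HEAD(example_destroy_list);

struct example_obj {
	struct list_head g_list;
};

/* Phase 1: object is already unreachable; queue it for deferred freeing. */
static void example_defer_free(struct example_obj *obj)
{
	spin_lock(&example_destroy_lock);
	list_add(&obj->g_list, &example_destroy_list);
	spin_unlock(&example_destroy_lock);
}

/* Phase 2: wait for all SRCU readers once, then free the whole batch. */
static void example_destroy_deferred(void)
{
	struct example_obj *obj, *next;
	LIST_HEAD(private_list);

	spin_lock(&example_destroy_lock);
	list_splice_init(&example_destroy_list, &private_list);
	spin_unlock(&example_destroy_lock);

	synchronize_srcu(&example_srcu);

	list_for_each_entry_safe(obj, next, &private_list, g_list)
		kfree(obj);
}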
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e361d1a0ca09..460c0cedab3a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
{
int ret;
u32 left_cpos, rec_range, trunc_range;
- int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ int is_rightmost_tree_rec = 0;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle,
memset(rec, 0, sizeof(*rec));
ocfs2_cleanup_merge(el, index);
- wants_rotate = 1;
next_free = le16_to_cpu(el->l_next_free_rec);
if (is_rightmost_tree_rec && next_free > 1) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1934abb6b680..6aaf3e351391 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,10 +272,21 @@ struct o2hb_region {
struct delayed_work hr_write_timeout_work;
unsigned long hr_last_timeout_start;
+ /* negotiate timer, used to negotiate extending hb timeout. */
+ struct delayed_work hr_nego_timeout_work;
+ unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
/* Used during o2hb_check_slot to hold a copy of the block
* being checked because we temporarily have to zero out the
* crc field. */
struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+ /* Message key for negotiate timeout message. */
+ unsigned int hr_key;
+ struct list_head hr_handler_list;
+
+ /* last hb status, 0 for success, other value for error. */
+ int hr_last_hb_status;
};
struct o2hb_bio_wait_ctxt {
@@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
+enum {
+ O2HB_NEGO_TIMEOUT_MSG = 1,
+ O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+ u8 node_num;
+};
+
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
@@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
o2quo_disk_timeout();
}
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
{
/* Arm writeout only after thread reaches steady state */
if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
spin_unlock(&o2hb_live_lock);
}
cancel_delayed_work(&reg->hr_write_timeout_work);
- reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+ cancel_delayed_work(&reg->hr_nego_timeout_work);
+ /* negotiate timeout must be less than write timeout. */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+ memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
}
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
{
cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+ cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+ struct o2hb_nego_msg msg;
+ int status, ret;
+
+ msg.node_num = o2nm_this_node();
+again:
+ ret = o2net_send_message(type, key, &msg, sizeof(msg),
+ target, &status);
+
+ if (ret == -EAGAIN || ret == -ENOMEM) {
+ msleep(100);
+ goto again;
+ }
+
+ return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int master_node, i, ret;
+ struct o2hb_region *reg;
+
+ reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+ /* don't negotiate timeout if last hb failed since it is very
+ * possible io failed. Should let write timeout fence self.
+ */
+ if (reg->hr_last_hb_status)
+ return;
+
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ /* lowest node as master node to make negotiate decision. */
+ master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+ if (master_node == o2nm_this_node()) {
+ if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ set_bit(master_node, reg->hr_nego_node_bitmap);
+ }
+ if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+ sizeof(reg->hr_nego_node_bitmap))) {
+ /* check negotiate bitmap every second to do timeout
+ * approve decision.
+ */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(1000));
+
+ return;
+ }
+
+ printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ /* approve negotiate timeout request. */
+ o2hb_arm_timeout(reg);
+
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ if (i == master_node)
+ continue;
+
+ mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+ ret = o2hb_send_nego_msg(reg->hr_key,
+ O2HB_NEGO_APPROVE_MSG, i);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+ i, ret);
+ }
+ } else {
+ /* negotiate timeout with master node. */
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+ reg->hr_dev_name, master_node);
+ ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+ master_node);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+ master_node, ret);
+ }
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+ struct o2hb_nego_msg *nego_msg;
+
+ nego_msg = (struct o2hb_nego_msg *)msg->buf;
+ printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+ nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+ if (nego_msg->node_num < O2NM_MAX_NODES)
+ set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+ else
+ mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+ return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+
+ printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ o2hb_arm_timeout(reg);
+ return 0;
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
/* Skip disarming the timeout if own slot has stale/bad data */
if (own_slot_ok) {
o2hb_set_quorum_device(reg);
- o2hb_arm_write_timeout(reg);
+ o2hb_arm_timeout(reg);
+ reg->hr_last_timeout_start = jiffies;
}
bail:
@@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data)
before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
+ reg->hr_last_hb_status = ret;
after_hb = ktime_get_real();
@@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data)
}
}
- o2hb_disarm_write_timeout(reg);
+ o2hb_disarm_timeout(reg);
/* unclean stop is only used in very bad situation */
for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1451,12 +1593,12 @@ static void o2hb_region_release(struct config_item *item)
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
+ o2net_unregister_handler_list(&reg->hr_handler_list);
kfree(reg);
}
static int o2hb_read_block_input(struct o2hb_region *reg,
const char *page,
- size_t count,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
@@ -1499,8 +1641,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
if (reg->hr_bdev)
return -EINVAL;
- status = o2hb_read_block_input(reg, page, count,
- &block_bytes, &block_bits);
+ status = o2hb_read_block_input(reg, page, &block_bytes,
+ &block_bits);
if (status)
return status;
@@ -1763,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+ INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
/*
* A node is considered live after it has beat LIVE_THRESHOLD
@@ -1996,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ /* this is the same way to generate msg key as dlm, for local heartbeat,
+ * name is also the same, so make initial crc value different to avoid
+ * message key conflict.
+ */
+ reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+ name, strlen(name));
+ INIT_LIST_HEAD(&reg->hr_handler_list);
+ ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_timeout_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto free;
+
+ ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_approve_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto unregister_handler;
+
ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
if (ret) {
config_item_put(&reg->hr_item);
- goto free;
+ goto unregister_handler;
}
return &reg->hr_item;
+
+unregister_handler:
+ o2net_unregister_handler_list(&reg->hr_handler_list);
free:
kfree(reg);
return ERR_PTR(ret);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2d0acd6678fe..4238eb28889f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -600,10 +600,11 @@ static void o2net_set_nn_state(struct o2net_node *nn,
static void o2net_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
+ struct o2net_sock_container *sc;
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_user_data) {
- struct o2net_sock_container *sc = sk->sk_user_data;
+ read_lock_bh(&sk->sk_callback_lock);
+ sc = sk->sk_user_data;
+ if (sc) {
sclog(sc, "data_ready hit\n");
o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
@@ -611,7 +612,7 @@ static void o2net_data_ready(struct sock *sk)
} else {
ready = sk->sk_data_ready;
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
}
@@ -622,7 +623,7 @@ static void o2net_state_change(struct sock *sk)
void (*state_change)(struct sock *sk);
struct o2net_sock_container *sc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
sc = sk->sk_user_data;
if (sc == NULL) {
state_change = sk->sk_state_change;
@@ -649,7 +650,7 @@ static void o2net_state_change(struct sock *sk)
break;
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
@@ -2012,7 +2013,7 @@ static void o2net_listen_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (ready == NULL) { /* check for teardown race */
ready = sk->sk_data_ready;
@@ -2039,7 +2040,7 @@ static void o2net_listen_data_ready(struct sock *sk)
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
if (ready != NULL)
ready(sk);
}
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b95e7df5b76a..94b18369b1cc 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,6 +44,9 @@
* version here in tcp_internal.h should not need to be bumped for
* filesystem locking changes.
*
+ * New in version 12
+ * - Negotiate hb timeout when storage is down.
+ *
* New in version 11
* - Negotiation of filesystem locking in the dlm join.
*
@@ -75,7 +78,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 11ULL
+#define O2NET_PROTOCOL_VERSION 12ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0748777f2e2a..c56a7679df93 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
if (is_bad_inode(inode)) {
iput(inode);
- if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
- (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
- /* Return OCFS2_FILECHECK_ERR_XXX related errno */
- inode = ERR_PTR(rc);
- else
- inode = ERR_PTR(-ESTALE);
+ inode = ERR_PTR(rc);
goto bail;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index f4cd3c3e9fb7..497a4171ef61 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
- return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+ return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 540ab5b75dbb..44d178b8d1aa 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -580,7 +580,7 @@ struct ocfs2_extended_slot {
/*00*/ __u8 es_valid;
__u8 es_reserved1[3];
__le32 es_node_num;
-/*10*/
+/*08*/
};
/*
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1e09592148ad..d7407994f308 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto bail;
- }
-bail:
ocfs2_free_slot_info(osb);
}
-
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ad16995c9e7a..d2053853951e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
}
static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
}
static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
@@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
}
static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 99c19545752c..5893ddde0e4b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -448,13 +448,14 @@ out_unlock:
}
static int orangefs_xattr_set_default(const struct xattr_handler *handler,
- struct dentry *dentry,
+ struct dentry *unused,
+ struct inode *inode,
const char *name,
const void *buffer,
size_t size,
int flags)
{
- return orangefs_inode_setxattr(dentry->d_inode,
+ return orangefs_inode_setxattr(inode,
ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
name,
buffer,
@@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler,
}
static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
- struct dentry *dentry,
+ struct dentry *unused,
+ struct inode *inode,
const char *name,
const void *buffer,
size_t size,
int flags)
{
- return orangefs_inode_setxattr(dentry->d_inode,
+ return orangefs_inode_setxattr(inode,
ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
name,
buffer,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index cc514da6f3e7..80aa6f1eb336 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -336,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
- struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
@@ -357,28 +356,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
return PTR_ERR(link);
}
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_free_link;
-
- override_cred->fsuid = stat->uid;
- override_cred->fsgid = stat->gid;
- /*
- * CAP_SYS_ADMIN for copying up extended attributes
- * CAP_DAC_OVERRIDE for create
- * CAP_FOWNER for chmod, timestamp update
- * CAP_FSETID for chmod
- * CAP_CHOWN for chown
- * CAP_MKNOD for mknod
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- cap_raise(override_cred->cap_effective, CAP_FSETID);
- cap_raise(override_cred->cap_effective, CAP_CHOWN);
- cap_raise(override_cred->cap_effective, CAP_MKNOD);
- old_cred = override_creds(override_cred);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
@@ -401,9 +379,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
out_unlock:
unlock_rename(workdir, upperdir);
revert_creds(old_cred);
- put_cred(override_cred);
-out_free_link:
if (link)
free_page((unsigned long) link);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index b3fc0a35bf62..c2a6b0894022 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -407,26 +407,20 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const struct cred *old_cred;
struct cred *override_cred;
+ old_cred = ovl_override_creds(dentry->d_sb);
+
err = -ENOMEM;
override_cred = prepare_creds();
- if (!override_cred)
- goto out_iput;
-
- /*
- * CAP_SYS_ADMIN for setting opaque xattr
- * CAP_DAC_OVERRIDE for create in workdir, rename
- * CAP_FOWNER for removing whiteout from sticky dir
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- old_cred = override_creds(override_cred);
-
- err = ovl_create_over_whiteout(dentry, inode, &stat, link,
- hardlink);
-
+ if (override_cred) {
+ override_cred->fsuid = old_cred->fsuid;
+ override_cred->fsgid = old_cred->fsgid;
+ put_cred(override_creds(override_cred));
+ put_cred(override_cred);
+
+ err = ovl_create_over_whiteout(dentry, inode, &stat,
+ link, hardlink);
+ }
revert_creds(old_cred);
- put_cred(override_cred);
}
if (!err)
@@ -662,32 +656,11 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
- const struct cred *old_cred;
- struct cred *override_cred;
-
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_drop_write;
-
- /*
- * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
- * CAP_DAC_OVERRIDE for create in workdir, rename
- * CAP_FOWNER for removing whiteout from sticky dir
- * CAP_FSETID for chmod of opaque dir
- * CAP_CHOWN for chown of opaque dir
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- cap_raise(override_cred->cap_effective, CAP_FSETID);
- cap_raise(override_cred->cap_effective, CAP_CHOWN);
- old_cred = override_creds(override_cred);
+ const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
- put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
@@ -725,7 +698,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
- struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
@@ -794,26 +766,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
- if (old_opaque || new_opaque) {
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_drop_write;
-
- /*
- * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
- * CAP_DAC_OVERRIDE for create in workdir
- * CAP_FOWNER for removing whiteout from sticky dir
- * CAP_FSETID for chmod of opaque dir
- * CAP_CHOWN for chown of opaque dir
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- cap_raise(override_cred->cap_effective, CAP_FSETID);
- cap_raise(override_cred->cap_effective, CAP_CHOWN);
- old_cred = override_creds(override_cred);
- }
+ if (old_opaque || new_opaque)
+ old_cred = ovl_override_creds(old->d_sb);
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
@@ -943,10 +897,8 @@ out_dput_old:
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
- if (old_opaque || new_opaque) {
+ if (old_opaque || new_opaque)
revert_creds(old_cred);
- put_cred(override_cred);
- }
out_drop_write:
ovl_drop_write(old);
out:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c7b31a03dc9c..1dbeab6cf96e 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -210,8 +210,9 @@ static bool ovl_is_private_xattr(const char *name)
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
-int ovl_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
int err;
struct dentry *upperdentry;
@@ -237,41 +238,27 @@ out:
return err;
}
-static bool ovl_need_xattr_filter(struct dentry *dentry,
- enum ovl_path_type type)
-{
- if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
- return S_ISDIR(dentry->d_inode->i_mode);
- else
- return false;
-}
-
ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size)
{
- struct path realpath;
- enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ struct dentry *realdentry = ovl_dentry_real(dentry);
- if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ if (ovl_is_private_xattr(name))
return -ENODATA;
- return vfs_getxattr(realpath.dentry, name, value, size);
+ return vfs_getxattr(realdentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
- struct path realpath;
- enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ struct dentry *realdentry = ovl_dentry_real(dentry);
ssize_t res;
int off;
- res = vfs_listxattr(realpath.dentry, list, size);
+ res = vfs_listxattr(realdentry, list, size);
if (res <= 0 || size == 0)
return res;
- if (!ovl_need_xattr_filter(dentry, type))
- return res;
-
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
@@ -301,7 +288,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
goto out;
err = -ENODATA;
- if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ if (ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 99ec4b035237..4bd9b5ba8f42 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -153,6 +153,7 @@ void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
+const struct cred *ovl_override_creds(struct super_block *sb);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
@@ -171,8 +172,9 @@ int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
-int ovl_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index da186ee4f846..cf37fc76fc9f 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,6 +36,7 @@ struct ovl_dir_cache {
struct ovl_readdir_data {
struct dir_context ctx;
+ struct dentry *dentry;
bool is_lowest;
struct rb_root root;
struct list_head *list;
@@ -206,21 +207,10 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
- struct cred *override_cred;
-
- override_cred = prepare_creds();
- if (!override_cred)
- return -ENOMEM;
- /*
- * CAP_DAC_OVERRIDE for lookup
- */
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- old_cred = override_creds(override_cred);
+ old_cred = ovl_override_creds(rdd->dentry->d_sb);
- inode_lock(dir->d_inode);
- err = 0;
- // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex);
+ err = down_write_killable(&dir->d_inode->i_rwsem);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
@@ -234,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
- put_cred(override_cred);
return err;
}
@@ -290,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .dentry = dentry,
.list = list,
.root = RB_ROOT,
.is_lowest = false,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ed53ae0fe868..ce02f46029da 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -42,6 +42,8 @@ struct ovl_fs {
long lower_namelen;
/* pathnames of lower and upper dirs, for show_options */
struct ovl_config config;
+ /* creds of process who forced instantiation of super block */
+ const struct cred *creator_cred;
};
struct ovl_dir_cache;
@@ -265,6 +267,13 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
+const struct cred *ovl_override_creds(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return override_creds(ofs->creator_cred);
+}
+
static bool ovl_is_opaquedir(struct dentry *dentry)
{
int res;
@@ -603,6 +612,7 @@ static void ovl_put_super(struct super_block *sb)
kfree(ufs->config.lowerdir);
kfree(ufs->config.upperdir);
kfree(ufs->config.workdir);
+ put_cred(ufs->creator_cred);
kfree(ufs);
}
@@ -1064,16 +1074,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
/*
* Upper should support d_type, else whiteouts are visible.
* Given workdir and upper are on same fs, we can do
- * iterate_dir() on workdir.
+ * iterate_dir() on workdir. This check requires successful
+ * creation of workdir in previous step.
*/
- err = ovl_check_d_type_supported(&workpath);
- if (err < 0)
- goto out_put_workdir;
+ if (ufs->workdir) {
+ err = ovl_check_d_type_supported(&workpath);
+ if (err < 0)
+ goto out_put_workdir;
- if (!err) {
- pr_err("overlayfs: upper fs needs to support d_type.\n");
- err = -EINVAL;
- goto out_put_workdir;
+ if (!err) {
+ pr_err("overlayfs: upper fs needs to support d_type.\n");
+ err = -EINVAL;
+ goto out_put_workdir;
+ }
}
}
@@ -1108,10 +1121,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
else
sb->s_d_op = &ovl_dentry_operations;
+ ufs->creator_cred = prepare_creds();
+ if (!ufs->creator_cred)
+ goto out_put_lower_mnt;
+
err = -ENOMEM;
oe = ovl_alloc_entry(numlower);
if (!oe)
- goto out_put_lower_mnt;
+ goto out_put_cred;
root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
if (!root_dentry)
@@ -1144,6 +1161,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
out_free_oe:
kfree(oe);
+out_put_cred:
+ put_cred(ufs->creator_cred);
out_put_lower_mnt:
for (i = 0; i < ufs->numlower; i++)
mntput(ufs->lower_mnt[i]);
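The overlayfs changes above stop raising individual capabilities per operation and instead capture the mounter's credentials once (creator_cred), switching to them around privileged steps via ovl_override_creds(). A minimal sketch of the override/revert pairing; struct example_fs and example_do_privileged() are invented for illustration:

#include <linux/cred.h>
#include <linux/fs.h>

struct example_fs {
	const struct cred *saved_cred;	/* taken with prepare_creds() at mount time */
};

static int example_do_privileged(struct super_block *sb,
				 int (*op)(struct super_block *sb))
{
	struct example_fs *fs = sb->s_fs_info;
	const struct cred *old_cred;
	int err;

	old_cred = override_creds(fs->saved_cred);	/* act as the mounter */
	err = op(sb);
	revert_creds(old_cred);				/* always restore */
	return err;
}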
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 2c60f17e7d92..8a4a266beff3 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
static int
posix_acl_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_backing_inode(dentry);
struct posix_acl *acl = NULL;
int ret;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index b6c00ce0e29e..88c7de12197b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -83,6 +83,7 @@
#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
+#include <linux/fs_struct.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -139,12 +140,25 @@ static inline const char *get_task_state(struct task_struct *tsk)
return task_state_array[fls(state)];
}
+static inline int get_task_umask(struct task_struct *tsk)
+{
+ struct fs_struct *fs;
+ int umask = -ENOENT;
+
+ task_lock(tsk);
+ fs = tsk->fs;
+ if (fs)
+ umask = fs->umask;
+ task_unlock(tsk);
+ return umask;
+}
+
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
- int g;
+ int g, umask;
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
@@ -162,6 +176,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
+ umask = get_task_umask(p);
+ if (umask >= 0)
+ seq_printf(m, "Umask:\t%#04o\n", umask);
+
task_lock(p);
if (p->files)
max_fds = files_fdtable(p->files)->max_fds;
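The array.c hunk above adds a Umask: line to /proc/<pid>/status whenever the task still has an fs_struct. A small userspace sketch that reads the new field back; illustrative only:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Umask:", 6) == 0) {
			fputs(line, stdout);	/* e.g. "Umask:  0022" */
			break;
		}
	}
	fclose(f);
	return 0;
}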
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ff4527dd69b7..a11eb7196ec8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3163,6 +3163,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
}
/*
+ * proc_tid_comm_permission is a special permission function exclusively
+ * used for the node /proc/<pid>/task/<tid>/comm.
+ * It bypasses generic permission checks in the case where a task of the same
+ * task group attempts to access the node.
+ * The rationale behind this is that glibc and bionic access this node for
+ * cross thread naming (pthread_set/getname_np(!self)). However, if
+ * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+ * which locks out the cross thread naming implementation.
+ * This function makes sure that the node is always accessible for members of
+ * same thread group.
+ */
+static int proc_tid_comm_permission(struct inode *inode, int mask)
+{
+ bool is_same_tgroup;
+ struct task_struct *task;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ is_same_tgroup = same_thread_group(current, task);
+ put_task_struct(task);
+
+ if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ /* This file (/proc/<pid>/task/<tid>/comm) can always be
+ * read or written by the members of the corresponding
+ * thread group.
+ */
+ return 0;
+ }
+
+ return generic_permission(inode, mask);
+}
+
+static const struct inode_operations proc_tid_comm_inode_operations = {
+ .permission = proc_tid_comm_permission,
+};
+
+/*
* Tasks
*/
static const struct pid_entry tid_base_stuff[] = {
@@ -3180,7 +3218,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
- REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+ NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
+ &proc_tid_comm_inode_operations,
+ &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
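proc_tid_comm_permission() above keeps /proc/<pid>/task/<tid>/comm accessible within a thread group even after PR_SET_DUMPABLE has made the rest of the pid directory uid 0/gid 0. The userspace pattern it protects is cross-thread naming; a sketch follows (compile with -pthread; glibc implements the call by writing the sibling's comm file):

#define _GNU_SOURCE
#include <pthread.h>
#include <unistd.h>

static void *worker(void *arg)
{
	sleep(1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	if (pthread_create(&t, NULL, worker, NULL))
		return 1;
	pthread_setname_np(t, "worker0");	/* at most 15 chars plus NUL */
	pthread_join(t, NULL);
	return 0;
}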
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9992cc..3ecd445e830d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
/*
- * Caveats on high order pages: page->_count will only be set
+ * Caveats on high order pages: page->_refcount will only be set
* -1 on the head page; SLUB/SLQB do the same for PG_slab;
* SLOB won't set PG_slab at all on compound pages.
*/
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 55bc7d6c8aac..06702783bf40 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
if (IS_ERR(sb))
return ERR_CAST(sb);
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+
if (!proc_parse_options(options, ns)) {
deactivate_locked_super(sb);
return ERR_PTR(-EINVAL);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 541583510cfb..4648c7f63ae2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1027,11 +1027,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
};
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ if (down_write_killable(&mm->mmap_sem)) {
+ count = -EINTR;
+ goto out_mm;
+ }
+
/*
* Writing 5 to /proc/pid/clear_refs resets the peak
* resident set size to this mm's current rss value.
*/
- down_write(&mm->mmap_sem);
reset_mm_hiwater_rss(mm);
up_write(&mm->mmap_sem);
goto out_mm;
@@ -1043,7 +1047,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem)) {
+ count = -EINTR;
+ goto out_mm;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
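The task_mmu.c hunks above make the mmap_sem acquisitions in clear_refs_write() killable, so a fatal signal now ends the write with -EINTR instead of blocking. For reference, the write that reaches the CLEAR_REFS_MM_HIWATER_RSS path looks like this from userspace (sketch only):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/clear_refs", "w");

	if (!f)
		return 1;
	fputs("5\n", f);	/* 5 selects the peak-RSS reset path */
	fclose(f);
	return 0;
}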
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 8afe10cf7df8..8ab782d8b33d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1071,7 +1071,7 @@ static int __init parse_crash_elf32_headers(void)
/* Do some basic Verification. */
if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
(ehdr.e_type != ET_CORE) ||
- !elf_check_arch(&ehdr) ||
+ !vmcore_elf32_check_arch(&ehdr) ||
ehdr.e_ident[EI_CLASS] != ELFCLASS32||
ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
ehdr.e_version != EV_CURRENT ||
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index a586467f6ff6..be3ddd189cd4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -211,14 +211,11 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
struct page **pages = NULL, **ptr, *page;
loff_t isize;
- if (!(flags & MAP_SHARED))
- return addr;
-
/* the mapping mustn't extend beyond the EOF */
lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
isize = i_size_read(inode);
- ret = -EINVAL;
+ ret = -ENOSYS;
maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= maxpages)
goto out;
@@ -227,7 +224,6 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
goto out;
/* gang-find the pages */
- ret = -ENOMEM;
pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto out_free;
@@ -263,7 +259,7 @@ out:
*/
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
return -ENOSYS;
file_accessed(file);
diff --git a/fs/readdir.c b/fs/readdir.c
index a86c6c04b9bc..9d0212c374d6 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -35,13 +35,13 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
if (res)
goto out;
- if (shared)
+ if (shared) {
inode_lock_shared(inode);
- else
- inode_lock(inode);
- // res = mutex_lock_killable(&inode->i_mutex);
- // if (res)
- // goto out;
+ } else {
+ res = down_write_killable(&inode->i_rwsem);
+ if (res)
+ goto out;
+ }
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
@@ -182,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
}
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user(offset, &dirent->d_off))
goto efault;
}
@@ -261,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
return -EINVAL;
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user(offset, &dirent->d_off))
goto efault;
}
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 99a5d5dae46a..415d66ca87d1 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -3,8 +3,8 @@
*/
#include <linux/string.h>
-#include <linux/random.h>
#include <linux/time.h>
+#include <linux/uuid.h>
#include "reiserfs.h"
/* find where objectid map starts */
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 86aeb9dd805a..e4cbb7719906 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused,
}
static int
-security_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *buffer, size_t size, int flags)
+security_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags)
{
- if (IS_PRIVATE(d_inode(dentry)))
+ if (IS_PRIVATE(inode))
return -EPERM;
- return reiserfs_xattr_set(d_inode(dentry),
+ return reiserfs_xattr_set(inode,
xattr_full_name(handler, name),
buffer, size, flags);
}
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 31837f031f59..f15a5f9e84ce 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused,
}
static int
-trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *buffer, size_t size, int flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags)
{
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+ if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
return -EPERM;
- return reiserfs_xattr_set(d_inode(dentry),
+ return reiserfs_xattr_set(inode,
xattr_full_name(handler, name),
buffer, size, flags);
}
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index f7c39731684b..dc59df43b2db 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused,
}
static int
-user_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *buffer, size_t size, int flags)
+user_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags)
{
- if (!reiserfs_xattrs_user(dentry->d_sb))
+ if (!reiserfs_xattrs_user(inode->i_sb))
return -EOPNOTSUPP;
- return reiserfs_xattr_set(d_inode(dentry),
+ return reiserfs_xattr_set(inode,
xattr_full_name(handler, name),
buffer, size, flags);
}
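The three reiserfs handlers above illustrate the new xattr_handler ->set() signature: the inode is now passed explicitly alongside the (possibly unrelated) dentry, so handlers no longer call d_inode(). A handler for a hypothetical filesystem would now look roughly like this sketch:

	static int foo_xattr_set(const struct xattr_handler *handler,
				 struct dentry *unused, struct inode *inode,
				 const char *name, const void *buffer,
				 size_t size, int flags)
	{
		if (IS_PRIVATE(inode))		/* never expose fs-internal inodes */
			return -EPERM;
		/* foo_setxattr() is a hypothetical per-fs helper */
		return foo_setxattr(inode, xattr_full_name(handler, name),
				    buffer, size, flags);
	}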
diff --git a/fs/select.c b/fs/select.c
index 869293988c2a..8ed9da50896a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -47,7 +47,7 @@
#define MAX_SLACK (100 * NSEC_PER_MSEC)
-static long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec64 *tv)
{
long slack;
int divfactor = 1000;
@@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-u64 select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec64 *tv)
{
u64 ret;
- struct timespec now;
+ struct timespec64 now;
/*
* Realtime tasks get a slack of 0 for obvious reasons.
@@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv)
if (rt_task(current))
return 0;
- ktime_get_ts(&now);
- now = timespec_sub(*tv, now);
+ ktime_get_ts64(&now);
+ now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
if (ret < current->timer_slack_ns)
return current->timer_slack_ns;
@@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout);
/**
* poll_select_set_timeout - helper function to setup the timeout value
- * @to: pointer to timespec variable for the final timeout
+ * @to: pointer to timespec64 variable for the final timeout
* @sec: seconds (from user space)
* @nsec: nanoseconds (from user space)
*
@@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout);
*
* Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
*/
-int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
- struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+ struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
- if (!timespec_valid(&ts))
+ if (!timespec64_valid(&ts))
return -EINVAL;
/* Optimize for the zero timeout value here */
if (!sec && !nsec) {
to->tv_sec = to->tv_nsec = 0;
} else {
- ktime_get_ts(to);
- *to = timespec_add_safe(*to, ts);
+ ktime_get_ts64(to);
+ *to = timespec64_add_safe(*to, ts);
}
return 0;
}
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+static int poll_select_copy_remaining(struct timespec64 *end_time,
+ void __user *p,
int timeval, int ret)
{
+ struct timespec64 rts64;
struct timespec rts;
struct timeval rtv;
@@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
if (!end_time->tv_sec && !end_time->tv_nsec)
return ret;
- ktime_get_ts(&rts);
- rts = timespec_sub(*end_time, rts);
- if (rts.tv_sec < 0)
- rts.tv_sec = rts.tv_nsec = 0;
+ ktime_get_ts64(&rts64);
+ rts64 = timespec64_sub(*end_time, rts64);
+ if (rts64.tv_sec < 0)
+ rts64.tv_sec = rts64.tv_nsec = 0;
+
+ rts = timespec64_to_timespec(rts64);
if (timeval) {
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
- rtv.tv_sec = rts.tv_sec;
- rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ rtv.tv_sec = rts64.tv_sec;
+ rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
if (!copy_to_user(p, &rtv, sizeof(rtv)))
return ret;
@@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* I'm trying ERESTARTNOHAND which restart only when you want to.
*/
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time)
+ fd_set __user *exp, struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
@@ -622,7 +626,7 @@ out_nofds:
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
fd_set __user *, exp, struct timeval __user *, tvp)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
struct timeval tv;
int ret;
@@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 ts64, end_time, *to = NULL;
int ret;
if (tsp) {
if (copy_from_user(&ts, tsp, sizeof(ts)))
return -EFAULT;
+ ts64 = timespec_to_timespec64(ts);
to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
return -EINVAL;
}
@@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
@@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
sizeof(struct pollfd))
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
@@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block)
{
struct pollfd __user *ufds = restart_block->poll.ufds;
int nfds = restart_block->poll.nfds;
- struct timespec *to = NULL, end_time;
+ struct timespec64 *to = NULL, end_time;
int ret;
if (restart_block->poll.has_timeout) {
@@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block)
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (timeout_msecs >= 0) {
@@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
size_t, sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (tsp) {
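The fs/select.c conversion above keeps every internal deadline in struct timespec64 so the arithmetic stays correct past 2038, converting back to the legacy types only at the user-space boundary. The core "how much of the deadline is left" computation reduces to the following sketch, using only helpers already visible in the hunks:

	/* Sketch: remaining time until an absolute monotonic deadline. */
	static void time_remaining(const struct timespec64 *end_time,
				   struct timespec64 *left)
	{
		struct timespec64 now;

		ktime_get_ts64(&now);
		*left = timespec64_sub(*end_time, now);
		if (left->tv_sec < 0)		/* deadline already expired */
			left->tv_sec = left->tv_nsec = 0;
	}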
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 595ca0debe11..69e287e20732 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -260,7 +260,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\txattr_names %u\n", ui->xattr_names);
pr_err("\tdirty %u\n", ui->dirty);
pr_err("\txattr %u\n", ui->xattr);
- pr_err("\tbulk_read %u\n", ui->xattr);
+ pr_err("\tbulk_read %u\n", ui->bulk_read);
pr_err("\tsynced_i_size %llu\n",
(unsigned long long)ui->synced_i_size);
pr_err("\tui_size %llu\n",
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index f4fbc7b6b794..3cbb904a6d7d 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,8 +28,8 @@
#include "ubifs.h"
#include <linux/slab.h>
-#include <linux/random.h>
#include <linux/math64.h>
+#include <linux/uuid.h>
/*
* Default journal size in logical eraseblocks as a percent of total
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 6c277eb6aef9..b5fc27969e9d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
}
static int ubifs_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
name, inode->i_ino, dentry, size);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 66cdb44616d5..2d97952e341a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
- mmput(ctx->mm);
+ mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
}
@@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
ACCESS_ONCE(ctx->released) = true;
+ if (!mmget_not_zero(mm))
+ goto wakeup;
+
/*
* Flush page faults out of all CPUs. NOTE: all page faults
* must be retried without returning VM_FAULT_SIGBUS if
@@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
up_write(&mm->mmap_sem);
-
+ mmput(mm);
+wakeup:
/*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
@@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
start = uffdio_register.range.start;
end = start + uffdio_register.range.len;
+ ret = -ENOMEM;
+ if (!mmget_not_zero(mm))
+ goto out;
+
down_write(&mm->mmap_sem);
vma = find_vma_prev(mm, start, &prev);
-
- ret = -ENOMEM;
if (!vma)
goto out_unlock;
@@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
} while (vma && vma->vm_start < end);
out_unlock:
up_write(&mm->mmap_sem);
+ mmput(mm);
if (!ret) {
/*
* Now that we scanned all vmas we can already tell
@@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
start = uffdio_unregister.start;
end = start + uffdio_unregister.len;
+ ret = -ENOMEM;
+ if (!mmget_not_zero(mm))
+ goto out;
+
down_write(&mm->mmap_sem);
vma = find_vma_prev(mm, start, &prev);
-
- ret = -ENOMEM;
if (!vma)
goto out_unlock;
@@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
} while (vma && vma->vm_start < end);
out_unlock:
up_write(&mm->mmap_sem);
+ mmput(mm);
out:
return ret;
}
@@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
goto out;
-
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len);
+ mmput(ctx->mm);
+ }
if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
return -EFAULT;
if (ret < 0)
@@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
goto out;
- ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ mmput(ctx->mm);
+ }
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
if (ret < 0)
@@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags)
ctx->released = false;
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
- atomic_inc(&ctx->mm->mm_users);
+ atomic_inc(&ctx->mm->mm_count);
file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
if (IS_ERR(file)) {
- mmput(ctx->mm);
+ mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
out:
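The userfaultfd changes above separate the two mm reference counts: the context now pins only mm_count (released with mmdrop()), and each operation that actually touches the address space takes a temporary mm_users reference with mmget_not_zero(), dropping it again with mmput(). Roughly, the pattern looks like this sketch (the -ESRCH return is an illustrative choice, not taken from the patch):

	/* Sketch: safely operate on an mm whose owner may already have exited. */
	static int poke_address_space(struct mm_struct *mm)
	{
		if (!mmget_not_zero(mm))	/* mm_users hit zero: nothing to do */
			return -ESRCH;

		down_write(&mm->mmap_sem);
		/* ... find and modify VMAs here ... */
		up_write(&mm->mmap_sem);

		mmput(mm);			/* drop the temporary mm_users reference */
		return 0;
	}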
diff --git a/fs/xattr.c b/fs/xattr.c
index b11945e15fde..4beafc43daa5 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
if (issec)
inode->i_flags &= ~S_NOSEC;
if (inode->i_op->setxattr) {
- error = inode->i_op->setxattr(dentry, name, value, size, flags);
+ error = inode->i_op->setxattr(dentry, inode, name, value, size, flags);
if (!error) {
fsnotify_xattr(dentry);
security_inode_post_setxattr(dentry, name, value,
@@ -655,6 +655,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
* operations to the correct xattr_handler.
*/
#define for_each_xattr_handler(handlers, handler) \
+ if (handlers) \
for ((handler) = *(handlers)++; \
(handler) != NULL; \
(handler) = *(handlers)++)
@@ -668,7 +669,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
const struct xattr_handler *handler;
if (!*name)
- return NULL;
+ return ERR_PTR(-EINVAL);
for_each_xattr_handler(handlers, handler) {
const char *n;
@@ -744,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
* Find the handler for the prefix and dispatch its set() operation.
*/
int
-generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
+generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
const struct xattr_handler *handler;
@@ -753,7 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
- return handler->set(handler, dentry, name, value, size, flags);
+ return handler->set(handler, dentry, inode, name, value, size, flags);
}
/*
@@ -768,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name)
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
- return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
+ return handler->set(handler, dentry, d_inode(dentry), name, NULL,
+ 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(generic_getxattr);
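The for_each_xattr_handler() tweak above simply hides an "if (handlers)" in front of the loop, so a NULL handler table (for example a superblock with no s_xattr) falls straight through instead of dereferencing NULL. Expanded by hand, the lookup behaves like this sketch (the match test is abbreviated):

	const struct xattr_handler *handler = NULL;

	if (handlers) {				/* NULL table: no handler, no crash */
		while ((handler = *handlers++) != NULL) {
			if (prefix_matches(handler, name))	/* hypothetical predicate */
				break;
		}
	}
	/* handler == NULL here if the table was NULL or nothing matched */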
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 686ba6fb20dd..339c696bbc01 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
}
void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
- xfs_km_flags_t flags)
+kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
{
- void *new;
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
- new = kmem_alloc(newsize, flags);
- if (ptr) {
- if (new)
- memcpy(new, ptr,
- ((oldsize < newsize) ? oldsize : newsize));
- kmem_free(ptr);
- }
- return new;
+ do {
+ ptr = krealloc(old, newsize, lflags);
+ if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
+ current->comm, current->pid,
+ newsize, __func__, lflags);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ } while (1);
}
void *
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index d1c66e465ca5..689f746224e7 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
extern void *kmem_alloc(size_t, xfs_km_flags_t);
extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
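With the change above, kmem_realloc() leans on krealloc(), which does the copy and frees (or reuses) the old buffer itself; the separate oldsize argument and the manual memcpy()/kmem_free() disappear. One property worth remembering when calling krealloc() directly: on failure it returns NULL and leaves the old buffer untouched, so the usual pattern keeps the old pointer until the new one is known to be good — a minimal sketch:

	/* Sketch: grow a buffer with krealloc() without leaking it on failure. */
	static int grow_buffer(void **bufp, size_t new_size)
	{
		void *new = krealloc(*bufp, new_size, GFP_KERNEL);

		if (!new)		/* old buffer at *bufp is still valid and still ours */
			return -ENOMEM;
		*bufp = new;		/* on success krealloc() owns and frees the old buffer */
		return 0;
	}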
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fa3b948ef9c2..4e126f41a0aa 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -242,37 +242,21 @@ xfs_attr_set(
return error;
}
- /*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
- args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+ tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+ tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
-
- if (rsvd)
- args.trans->t_flags |= XFS_TRANS_RESERVE;
-
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
- if (error) {
- xfs_trans_cancel(args.trans);
+ error = xfs_trans_alloc(mp, &tres, args.total, 0,
+ rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+ if (error)
return error;
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL);
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
@@ -429,31 +413,15 @@ xfs_attr_remove(
return error;
/*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
- args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
- /*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
-
- if (flags & ATTR_ROOT)
- args.trans->t_flags |= XFS_TRANS_RESERVE;
-
- error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
- XFS_ATTRRM_SPACE_RES(mp), 0);
- if (error) {
- xfs_trans_cancel(args.trans);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
+ XFS_ATTRRM_SPACE_RES(mp), 0,
+ (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
+ &args.trans);
+ if (error)
return error;
- }
xfs_ilock(dp, XFS_ILOCK_EXCL);
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ce41d7fe753c..932381caef1b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork(
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+
blks = XFS_ADDAFORK_SPACE_RES(mp);
- if (rsvd)
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
- if (error) {
- xfs_trans_cancel(tp);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+ if (error)
return error;
- }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
@@ -6026,13 +6025,10 @@ xfs_bmap_split_extent(
xfs_fsblock_t firstfsb;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
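The xfs_attr.c and xfs_bmap.c hunks above (and many more below) apply the same conversion: the open-coded sequence of xfs_trans_alloc(), xfs_trans_reserve() and xfs_trans_cancel()-on-failure collapses into one call to the new xfs_trans_alloc(), which either hands back a fully reserved transaction or returns an error with nothing left to clean up. A typical caller now reads like this sketch (reservation, block counts and flags are placeholders):

	struct xfs_trans	*tp;
	int			error;

	/* allocate + reserve in one step; no xfs_trans_cancel() needed on failure */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	/* ... make the change, then xfs_trans_commit(tp) ... */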
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 974d62e677f4..e5bb9cc3b243 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -257,15 +257,12 @@ xfs_dir2_block_to_sf(
*
* Convert the inode to local format and copy the data in.
*/
- dp->i_df.if_flags &= ~XFS_IFEXTENTS;
- dp->i_df.if_flags |= XFS_IFINLINE;
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
ASSERT(dp->i_df.if_bytes == 0);
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+ xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ dp->i_d.di_size = size;
logflags |= XFS_ILOG_DDATA;
- memcpy(dp->i_df.if_u1.if_data, dst, size);
- dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 11faf7df14c8..bbcc8c7a44b3 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -231,6 +231,48 @@ xfs_iformat_fork(
return error;
}
+void
+xfs_init_local_fork(
+ struct xfs_inode *ip,
+ int whichfork,
+ const void *data,
+ int size)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int mem_size = size, real_size = 0;
+ bool zero_terminate;
+
+ /*
+ * If we are using the local fork to store a symlink body we need to
+ * zero-terminate it so that we can pass it back to the VFS directly.
+ * Overallocate the in-memory fork by one for that and add a zero
+ * to terminate it below.
+ */
+ zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+ if (zero_terminate)
+ mem_size++;
+
+ if (size == 0)
+ ifp->if_u1.if_data = NULL;
+ else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ else {
+ real_size = roundup(mem_size, 4);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+ }
+
+ if (size) {
+ memcpy(ifp->if_u1.if_data, data, size);
+ if (zero_terminate)
+ ifp->if_u1.if_data[size] = '\0';
+ }
+
+ ifp->if_bytes = size;
+ ifp->if_real_bytes = real_size;
+ ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+ ifp->if_flags |= XFS_IFINLINE;
+}
+
/*
* The file is in-lined in the on-disk inode.
* If it fits into if_inline_data, then copy
@@ -248,8 +290,6 @@ xfs_iformat_local(
int whichfork,
int size)
{
- xfs_ifork_t *ifp;
- int real_size;
/*
* If the size is unreasonable, then something
@@ -265,22 +305,8 @@ xfs_iformat_local(
ip->i_mount, dip);
return -EFSCORRUPTED;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- real_size = 0;
- if (size == 0)
- ifp->if_u1.if_data = NULL;
- else if (size <= sizeof(ifp->if_u2.if_inline_data))
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- else {
- real_size = roundup(size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
- }
- ifp->if_bytes = size;
- ifp->if_real_bytes = real_size;
- if (size)
- memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFINLINE;
+
+ xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
return 0;
}
@@ -516,7 +542,6 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
KM_SLEEP | KM_NOFS);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
@@ -660,7 +685,6 @@ xfs_idata_realloc(
ifp->if_u1.if_data =
kmem_realloc(ifp->if_u1.if_data,
real_size,
- ifp->if_real_bytes,
KM_SLEEP | KM_NOFS);
}
} else {
@@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct(
if (rnew_size != ifp->if_real_bytes) {
ifp->if_u1.if_extents =
kmem_realloc(ifp->if_u1.if_extents,
- rnew_size,
- ifp->if_real_bytes, KM_NOFS);
+ rnew_size, KM_NOFS);
}
if (rnew_size > ifp->if_real_bytes) {
memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect(
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {
- ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
- kmem_realloc(ifp->if_u1.if_ext_irec,
- new_size, size, KM_NOFS);
+ ifp->if_u1.if_ext_irec =
+ kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
}
}
@@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct(
}
/*
+ * Remove all records from the indirection array.
+ */
+STATIC void
+xfs_iext_irec_remove_all(
+ struct xfs_ifork *ifp)
+{
+ int nlists;
+ int i;
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ for (i = 0; i < nlists; i++)
+ kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
+ kmem_free(ifp->if_u1.if_ext_irec);
+ ifp->if_flags &= ~XFS_IFEXTIREC;
+}
+
+/*
* Free incore file extents.
*/
void
@@ -1504,14 +1544,7 @@ xfs_iext_destroy(
xfs_ifork_t *ifp) /* inode fork pointer */
{
if (ifp->if_flags & XFS_IFEXTIREC) {
- int erp_idx;
- int nlists;
-
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
- xfs_iext_irec_remove(ifp, erp_idx);
- }
- ifp->if_flags &= ~XFS_IFEXTIREC;
+ xfs_iext_irec_remove_all(ifp);
} else if (ifp->if_real_bytes) {
kmem_free(ifp->if_u1.if_extents);
} else if (ifp->if_bytes) {
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..f95e072ae646 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -134,6 +134,7 @@ void xfs_iroot_realloc(struct xfs_inode *, int, int);
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
+void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
struct xfs_bmbt_rec_host *
xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index d54a8018b079..e8f49c029ff0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -212,6 +212,11 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT 40
+
+/*
* Log item types.
*/
#define XFS_LI_EFI 0x1236
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8a53eaa349f4..12ca86778e02 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -838,12 +838,10 @@ xfs_sync_sb(
struct xfs_trans *tp;
int error;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+ XFS_TRANS_NO_WRITECOUNT, &tp);
+ if (error)
return error;
- }
xfs_log_sb(tp);
if (wait)
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 81ac870834da..16002b5ec4eb 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -56,103 +56,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
/*
- * Transaction types. Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE 1
-#define XFS_TRANS_SETATTR_SIZE 2
-#define XFS_TRANS_INACTIVE 3
-#define XFS_TRANS_CREATE 4
-#define XFS_TRANS_CREATE_TRUNC 5
-#define XFS_TRANS_TRUNCATE_FILE 6
-#define XFS_TRANS_REMOVE 7
-#define XFS_TRANS_LINK 8
-#define XFS_TRANS_RENAME 9
-#define XFS_TRANS_MKDIR 10
-#define XFS_TRANS_RMDIR 11
-#define XFS_TRANS_SYMLINK 12
-#define XFS_TRANS_SET_DMATTRS 13
-#define XFS_TRANS_GROWFS 14
-#define XFS_TRANS_STRAT_WRITE 15
-#define XFS_TRANS_DIOSTRAT 16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define XFS_TRANS_WRITEID 18
-#define XFS_TRANS_ADDAFORK 19
-#define XFS_TRANS_ATTRINVAL 20
-#define XFS_TRANS_ATRUNCATE 21
-#define XFS_TRANS_ATTR_SET 22
-#define XFS_TRANS_ATTR_RM 23
-#define XFS_TRANS_ATTR_FLAG 24
-#define XFS_TRANS_CLEAR_AGI_BUCKET 25
-#define XFS_TRANS_SB_CHANGE 26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1 27
-#define XFS_TRANS_DUMMY2 28
-#define XFS_TRANS_QM_QUOTAOFF 29
-#define XFS_TRANS_QM_DQALLOC 30
-#define XFS_TRANS_QM_SETQLIM 31
-#define XFS_TRANS_QM_DQCLUSTER 32
-#define XFS_TRANS_QM_QINOCREATE 33
-#define XFS_TRANS_QM_QUOTAOFF_END 34
-#define XFS_TRANS_FSYNC_TS 35
-#define XFS_TRANS_GROWFSRT_ALLOC 36
-#define XFS_TRANS_GROWFSRT_ZERO 37
-#define XFS_TRANS_GROWFSRT_FREE 38
-#define XFS_TRANS_SWAPEXT 39
-#define XFS_TRANS_CHECKPOINT 40
-#define XFS_TRANS_ICREATE 41
-#define XFS_TRANS_CREATE_TMPFILE 42
-#define XFS_TRANS_TYPE_MAX 43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
- { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
- { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
- { XFS_TRANS_INACTIVE, "INACTIVE" }, \
- { XFS_TRANS_CREATE, "CREATE" }, \
- { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
- { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
- { XFS_TRANS_REMOVE, "REMOVE" }, \
- { XFS_TRANS_LINK, "LINK" }, \
- { XFS_TRANS_RENAME, "RENAME" }, \
- { XFS_TRANS_MKDIR, "MKDIR" }, \
- { XFS_TRANS_RMDIR, "RMDIR" }, \
- { XFS_TRANS_SYMLINK, "SYMLINK" }, \
- { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
- { XFS_TRANS_GROWFS, "GROWFS" }, \
- { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
- { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
- { XFS_TRANS_WRITEID, "WRITEID" }, \
- { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
- { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
- { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
- { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
- { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
- { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
- { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
- { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \
- { XFS_TRANS_DUMMY1, "DUMMY1" }, \
- { XFS_TRANS_DUMMY2, "DUMMY2" }, \
- { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
- { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
- { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
- { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
- { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
- { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
- { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
- { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
- { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
- { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
- { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
- { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
- { XFS_TRANS_ICREATE, "ICREATE" }, \
- { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
- { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
-
-/*
* This structure is used to track log items associated with
* a transaction. It points to the log item and keeps some
* flags to track the state of the log item. It also tracks
@@ -181,8 +84,9 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
- count in superblock */
+#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
+#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */
+
/*
* Field values for xfs_trans_mod_sb.
*/
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c535887c60a8..4c463b99fe57 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -84,23 +84,71 @@ xfs_find_bdev_for_inode(
}
/*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory. Do not use the ioend after this.
+ * We're now finished for good with this page. Update the page state via the
+ * associated buffer_heads, paying attention to the start and end offsets that
+ * we need to process on the page.
+ */
+static void
+xfs_finish_page_writeback(
+ struct inode *inode,
+ struct bio_vec *bvec,
+ int error)
+{
+ unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
+ struct buffer_head *head, *bh;
+ unsigned int off = 0;
+
+ ASSERT(bvec->bv_offset < PAGE_SIZE);
+ ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+ ASSERT(end < PAGE_SIZE);
+ ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+
+ bh = head = page_buffers(bvec->bv_page);
+
+ do {
+ if (off < bvec->bv_offset)
+ goto next_bh;
+ if (off > end)
+ break;
+ bh->b_end_io(bh, !error);
+next_bh:
+ off += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+}
+
+/*
+ * We're now finished for good with this ioend structure. Update the page
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
*/
STATIC void
xfs_destroy_ioend(
- xfs_ioend_t *ioend)
+ struct xfs_ioend *ioend,
+ int error)
{
- struct buffer_head *bh, *next;
+ struct inode *inode = ioend->io_inode;
+ struct bio *last = ioend->io_bio;
+ struct bio *bio, *next;
- for (bh = ioend->io_buffer_head; bh; bh = next) {
- next = bh->b_private;
- bh->b_end_io(bh, !ioend->io_error);
- }
+ for (bio = &ioend->io_inline_bio; bio; bio = next) {
+ struct bio_vec *bvec;
+ int i;
+
+ /*
+ * For the last bio, bi_private points to the ioend, so we
+ * need to explicitly end the iteration here.
+ */
+ if (bio == last)
+ next = NULL;
+ else
+ next = bio->bi_private;
- mempool_free(ioend, xfs_ioend_pool);
+ /* walk each page on bio, ending page IO on them */
+ bio_for_each_segment_all(bvec, bio, i)
+ xfs_finish_page_writeback(inode, bvec, error);
+
+ bio_put(bio);
+ }
}
/*
@@ -120,13 +168,9 @@ xfs_setfilesize_trans_alloc(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
return error;
- }
ioend->io_append_trans = tp;
@@ -174,7 +218,8 @@ xfs_setfilesize(
STATIC int
xfs_setfilesize_ioend(
- struct xfs_ioend *ioend)
+ struct xfs_ioend *ioend,
+ int error)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_trans *tp = ioend->io_append_trans;
@@ -188,53 +233,32 @@ xfs_setfilesize_ioend(
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
- if (ioend->io_error) {
+ if (error) {
xfs_trans_cancel(tp);
- return ioend->io_error;
+ return error;
}
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
/*
- * Schedule IO completion handling on the final put of an ioend.
- *
- * If there is no work to do we might as well call it a day and free the
- * ioend right now.
- */
-STATIC void
-xfs_finish_ioend(
- struct xfs_ioend *ioend)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
-
- if (ioend->io_type == XFS_IO_UNWRITTEN)
- queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
- queue_work(mp->m_data_workqueue, &ioend->io_work);
- else
- xfs_destroy_ioend(ioend);
- }
-}
-
-/*
* IO write completion.
*/
STATIC void
xfs_end_io(
struct work_struct *work)
{
- xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- int error = 0;
+ struct xfs_ioend *ioend =
+ container_of(work, struct xfs_ioend, io_work);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ int error = ioend->io_bio->bi_error;
/*
* Set an error if the mount has shut down and proceed with end I/O
* processing so it can perform whatever cleanups are necessary.
*/
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- ioend->io_error = -EIO;
+ error = -EIO;
/*
* For unwritten extents we need to issue transactions to convert a
@@ -244,55 +268,33 @@ xfs_end_io(
* on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
- if (ioend->io_error)
+ if (error)
goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
- error = xfs_setfilesize_ioend(ioend);
+ error = xfs_setfilesize_ioend(ioend, error);
} else {
ASSERT(!xfs_ioend_is_append(ioend));
}
done:
- if (error)
- ioend->io_error = error;
- xfs_destroy_ioend(ioend);
+ xfs_destroy_ioend(ioend, error);
}
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
- struct inode *inode,
- unsigned int type)
+STATIC void
+xfs_end_bio(
+ struct bio *bio)
{
- xfs_ioend_t *ioend;
-
- ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
- /*
- * Set the count to 1 initially, which will prevent an I/O
- * completion callback from happening before we have started
- * all the I/O from calling the completion routine too early.
- */
- atomic_set(&ioend->io_remaining, 1);
- ioend->io_error = 0;
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
- ioend->io_inode = inode;
- ioend->io_buffer_head = NULL;
- ioend->io_buffer_tail = NULL;
- ioend->io_offset = 0;
- ioend->io_size = 0;
- ioend->io_append_trans = NULL;
+ struct xfs_ioend *ioend = bio->bi_private;
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- INIT_WORK(&ioend->io_work, xfs_end_io);
- return ioend;
+ if (ioend->io_type == XFS_IO_UNWRITTEN)
+ queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+ else if (ioend->io_append_trans)
+ queue_work(mp->m_data_workqueue, &ioend->io_work);
+ else
+ xfs_destroy_ioend(ioend, bio->bi_error);
}
STATIC int
@@ -364,50 +366,6 @@ xfs_imap_valid(
offset < imap->br_startoff + imap->br_blockcount;
}
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
- struct bio *bio)
-{
- xfs_ioend_t *ioend = bio->bi_private;
-
- if (!ioend->io_error)
- ioend->io_error = bio->bi_error;
-
- /* Toss bio and pass work off to an xfsdatad thread */
- bio->bi_private = NULL;
- bio->bi_end_io = NULL;
- bio_put(bio);
-
- xfs_finish_ioend(ioend);
-}
-
-STATIC void
-xfs_submit_ioend_bio(
- struct writeback_control *wbc,
- xfs_ioend_t *ioend,
- struct bio *bio)
-{
- atomic_inc(&ioend->io_remaining);
- bio->bi_private = ioend;
- bio->bi_end_io = xfs_end_bio;
- submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
- struct buffer_head *bh)
-{
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-
- ASSERT(bio->bi_private == NULL);
- bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
- return bio;
-}
-
STATIC void
xfs_start_buffer_writeback(
struct buffer_head *bh)
@@ -452,28 +410,35 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
}
/*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
- * time; the caller is responsible for chaining prior to submission.
+ * Submit the bio for an ioend. We are passed an ioend with a bio attached to
+ * it, and we submit that bio. The ioend may be used for multiple bio
+ * submissions, so we only want to allocate an append transaction for the ioend
+ * once. In the case of multiple bio submission, each bio will take an IO
+ * reference to the ioend to ensure that the ioend completion is only done once
+ * all bios have been submitted and the ioend is really done.
*
* If @fail is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked pages for writeback
- * and unlocked them. In this situation, we need to fail the ioend chain rather
- * than submit it to IO. This typically only happens on a filesystem shutdown.
+ * and unlocked them. In this situation, we need to fail the bio and ioend
+ * rather than submit it to IO. This typically only happens on a filesystem
+ * shutdown.
*/
STATIC int
xfs_submit_ioend(
struct writeback_control *wbc,
- xfs_ioend_t *ioend,
+ struct xfs_ioend *ioend,
int status)
{
- struct buffer_head *bh;
- struct bio *bio;
- sector_t lastblock = 0;
-
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ ioend->io_type != XFS_IO_UNWRITTEN &&
+ xfs_ioend_is_append(ioend) &&
+ !ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
+
+ ioend->io_bio->bi_private = ioend;
+ ioend->io_bio->bi_end_io = xfs_end_bio;
+
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
@@ -481,33 +446,73 @@ xfs_submit_ioend(
* time.
*/
if (status) {
- ioend->io_error = status;
- xfs_finish_ioend(ioend);
+ ioend->io_bio->bi_error = status;
+ bio_endio(ioend->io_bio);
return status;
}
- bio = NULL;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+ submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+ ioend->io_bio);
+ return 0;
+}
- if (!bio) {
-retry:
- bio = xfs_alloc_ioend_bio(bh);
- } else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
+static void
+xfs_init_bio_from_bh(
+ struct bio *bio,
+ struct buffer_head *bh)
+{
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+}
- if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
+static struct xfs_ioend *
+xfs_alloc_ioend(
+ struct inode *inode,
+ unsigned int type,
+ xfs_off_t offset,
+ struct buffer_head *bh)
+{
+ struct xfs_ioend *ioend;
+ struct bio *bio;
- lastblock = bh->b_blocknr;
- }
- if (bio)
- xfs_submit_ioend_bio(wbc, ioend, bio);
- xfs_finish_ioend(ioend);
- return 0;
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+ xfs_init_bio_from_bh(bio, bh);
+
+ ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_type = type;
+ ioend->io_inode = inode;
+ ioend->io_size = 0;
+ ioend->io_offset = offset;
+ INIT_WORK(&ioend->io_work, xfs_end_io);
+ ioend->io_append_trans = NULL;
+ ioend->io_bio = bio;
+ return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in xfs_destroy_ioend().
+ */
+static void
+xfs_chain_bio(
+ struct xfs_ioend *ioend,
+ struct writeback_control *wbc,
+ struct buffer_head *bh)
+{
+ struct bio *new;
+
+ new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+ xfs_init_bio_from_bh(new, bh);
+
+ bio_chain(ioend->io_bio, new);
+ bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
+ submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+ ioend->io_bio);
+ ioend->io_bio = new;
}
/*
@@ -523,27 +528,24 @@ xfs_add_to_ioend(
struct buffer_head *bh,
xfs_off_t offset,
struct xfs_writepage_ctx *wpc,
+ struct writeback_control *wbc,
struct list_head *iolist)
{
if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
bh->b_blocknr != wpc->last_block + 1 ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
- struct xfs_ioend *new;
-
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
-
- new = xfs_alloc_ioend(inode, wpc->io_type);
- new->io_offset = offset;
- new->io_buffer_head = bh;
- new->io_buffer_tail = bh;
- wpc->ioend = new;
- } else {
- wpc->ioend->io_buffer_tail->b_private = bh;
- wpc->ioend->io_buffer_tail = bh;
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
}
- bh->b_private = NULL;
+ /*
+ * If the buffer doesn't fit into the bio we need to allocate a new
+ * one. This shouldn't happen more than once for a given buffer.
+ */
+ while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
+ xfs_chain_bio(wpc->ioend, wbc, bh);
+
wpc->ioend->io_size += bh->b_size;
wpc->last_block = bh->b_blocknr;
xfs_start_buffer_writeback(bh);
@@ -803,7 +805,7 @@ xfs_writepage_map(
lock_buffer(bh);
if (wpc->io_type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, &wpc->imap, offset);
- xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+ xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
count++;
}
@@ -1391,13 +1393,10 @@ xfs_end_io_direct_write(
trace_xfs_end_io_direct_write_append(ip, offset, size);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
- return error;
- }
- error = xfs_setfilesize(ip, tp, offset, size);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
+ &tp);
+ if (!error)
+ error = xfs_setfilesize(ip, tp, offset, size);
}
return error;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b4421177b68d..814aab790713 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern mempool_t *xfs_ioend_pool;
+extern struct bio_set *xfs_ioend_bioset;
/*
* Types of I/O for bmap clustering and I/O completion tracking.
@@ -37,22 +37,19 @@ enum {
{ XFS_IO_OVERWRITE, "overwrite" }
/*
- * xfs_ioend struct manages large extent writes for XFS.
- * It can manage several multi-page bio's at once.
+ * Structure for buffered I/O completions.
*/
-typedef struct xfs_ioend {
+struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
unsigned int io_type; /* delalloc / unwritten */
- int io_error; /* I/O error code */
- atomic_t io_remaining; /* hold count */
struct inode *io_inode; /* file being written to */
- struct buffer_head *io_buffer_head;/* buffer linked list head */
- struct buffer_head *io_buffer_tail;/* buffer linked list tail */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
-} xfs_ioend_t;
+ struct bio *io_bio; /* bio being built */
+ struct bio io_inline_bio; /* MUST BE LAST! */
+};
extern const struct address_space_operations xfs_address_space_operations;
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index dd4824589470..e3da5d448bcf 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern {
*========================================================================*/
+/* Return 0 on success, or -errno; other state communicated via *context */
typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
- unsigned char *, int, int, unsigned char *);
+ unsigned char *, int, int);
typedef struct xfs_attr_list_context {
struct xfs_inode *dp; /* inode */
@@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context {
int firstu; /* first used byte in buffer */
int flags; /* from VOP call */
int resynch; /* T/F: resynch with cursor */
- int put_value; /* T/F: need value for listent */
put_listent_func_t put_listent; /* list output fmt function */
int index; /* index into output buffer */
} xfs_attr_list_context_t;
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2bb959ada45b..55d214981ed2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -405,21 +405,11 @@ xfs_attr_inactive(
goto out_destroy_fork;
xfs_iunlock(dp, lock_mode);
- /*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
lock_mode = 0;
- trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
- error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
if (error)
- goto out_cancel;
+ goto out_destroy_fork;
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(dp, lock_mode);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 4fa14820e2e2..d25f26b22ac9 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sfe->flags,
sfe->nameval,
(int)sfe->namelen,
- (int)sfe->valuelen,
- &sfe->nameval[sfe->namelen]);
-
+ (int)sfe->valuelen);
+ if (error)
+ return error;
/*
* Either search callback finished early or
* didn't fit it all in the buffer after all.
*/
if (context->seen_enough)
break;
-
- if (error)
- return error;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
}
trace_xfs_attr_list_sf_all(context);
@@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sbp->flags,
sbp->name,
sbp->namelen,
- sbp->valuelen,
- &sbp->name[sbp->namelen]);
+ sbp->valuelen);
if (error) {
kmem_free(sbuf);
return error;
@@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int(
*/
retval = 0;
for (; i < ichdr.count; entry++, i++) {
+ char *name;
+ int namelen, valuelen;
+
if (be32_to_cpu(entry->hashval) != cursor->hashval) {
cursor->hashval = be32_to_cpu(entry->hashval);
cursor->offset = 0;
@@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int(
continue; /* skip incomplete entries */
if (entry->flags & XFS_ATTR_LOCAL) {
- xfs_attr_leaf_name_local_t *name_loc =
- xfs_attr3_leaf_name_local(leaf, i);
-
- retval = context->put_listent(context,
- entry->flags,
- name_loc->nameval,
- (int)name_loc->namelen,
- be16_to_cpu(name_loc->valuelen),
- &name_loc->nameval[name_loc->namelen]);
- if (retval)
- return retval;
+ xfs_attr_leaf_name_local_t *name_loc;
+
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ name = name_loc->nameval;
+ namelen = name_loc->namelen;
+ valuelen = be16_to_cpu(name_loc->valuelen);
} else {
- xfs_attr_leaf_name_remote_t *name_rmt =
- xfs_attr3_leaf_name_remote(leaf, i);
-
- int valuelen = be32_to_cpu(name_rmt->valuelen);
-
- if (context->put_value) {
- xfs_da_args_t args;
-
- memset((char *)&args, 0, sizeof(args));
- args.geo = context->dp->i_mount->m_attr_geo;
- args.dp = context->dp;
- args.whichfork = XFS_ATTR_FORK;
- args.valuelen = valuelen;
- args.rmtvaluelen = valuelen;
- args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
- args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args.rmtblkcnt = xfs_attr3_rmt_blocks(
- args.dp->i_mount, valuelen);
- retval = xfs_attr_rmtval_get(&args);
- if (!retval)
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- args.value);
- kmem_free(args.value);
- } else {
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- NULL);
- }
- if (retval)
- return retval;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ name = name_rmt->name;
+ namelen = name_rmt->namelen;
+ valuelen = be32_to_cpu(name_rmt->valuelen);
}
+
+ retval = context->put_listent(context, entry->flags,
+ name, namelen, valuelen);
+ if (retval)
+ break;
if (context->seen_enough)
break;
cursor->offset++;
@@ -551,8 +519,7 @@ xfs_attr_put_listent(
int flags,
unsigned char *name,
int namelen,
- int valuelen,
- unsigned char *value)
+ int valuelen)
{
struct attrlist *alist = (struct attrlist *)context->alist;
attrlist_ent_t *aep;
@@ -581,7 +548,7 @@ xfs_attr_put_listent(
trace_xfs_attr_list_full(context);
alist->al_more = 1;
context->seen_enough = 1;
- return 1;
+ return 0;
}
aep = (attrlist_ent_t *)&context->alist[context->firstu];
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3b6309865c65..586bb64e674b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -72,18 +72,11 @@ xfs_zero_extent(
struct xfs_mount *mp = ip->i_mount;
xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
sector_t block = XFS_BB_TO_FSBT(mp, sector);
- ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
-
- if (IS_DAX(VFS_I(ip)))
- return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
- sector, size);
-
- /*
- * let the block layer decide on the fastest method of
- * implementing the zeroing.
- */
- return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+ return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+ block << (mp->m_super->s_blocksize_bits - 9),
+ count_fsb << (mp->m_super->s_blocksize_bits - 9),
+ GFP_NOFS, true);
}
/*
@@ -900,19 +893,15 @@ xfs_free_eofblocks(
* Free them up now by truncating the file to
* its current size.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
if (need_iolock) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- xfs_trans_cancel(tp);
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
return -EAGAIN;
- }
}
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+ &tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
if (need_iolock)
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
@@ -1037,9 +1026,9 @@ xfs_alloc_file_space(
/*
* Allocate and setup the transaction.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- resblks, resrtextents);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+ resrtextents, 0, &tp);
+
/*
* Check for running out of space
*/
@@ -1048,7 +1037,6 @@ xfs_alloc_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1311,18 +1299,10 @@ xfs_free_file_space(
* transaction to dip into the reserve blocks to ensure
* the freeing of the space succeeds at ENOSPC.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
-
- /*
- * check for running out of space
- */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+ &tp);
if (error) {
- /*
- * Free the transaction structure.
- */
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1482,19 +1462,16 @@ xfs_shift_file_space(
}
while (!error && !done) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
/*
* We would need to reserve permanent block for transaction.
* This will come into picture when after shifting extent into
* hole we found that adjacent extents can be merged which
* may lead to freeing of a block during record update.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
break;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
@@ -1747,12 +1724,9 @@ xfs_swap_extents(
if (error)
goto out_unlock;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
goto out_unlock;
- }
/*
* Lock and join the inodes to the transaction so that transaction commit
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9a2191b91137..e71cfbd5acb3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1100,22 +1100,18 @@ xfs_bwrite(
return error;
}
-STATIC void
+static void
xfs_buf_bio_end_io(
struct bio *bio)
{
- xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
+ struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
/*
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (bio->bi_error) {
- spin_lock(&bp->b_lock);
- if (!bp->b_io_error)
- bp->b_io_error = bio->bi_error;
- spin_unlock(&bp->b_lock);
- }
+ if (bio->bi_error)
+ cmpxchg(&bp->b_io_error, 0, bio->bi_error);
if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
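The xfs_buf completion above drops the spinlock that used to guard "record only the first I/O error" and relies on a single cmpxchg(): the field starts at zero, and only the first non-zero writer can install its value. As a standalone sketch:

	/* Sketch: lockless "first error wins" recording. */
	static void record_first_error(int *slot, int new_error)
	{
		if (new_error)
			cmpxchg(slot, 0, new_error);	/* no-op if *slot is already set */
	}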
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4eb89bd4ee73..8bfb974f0772 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -183,6 +183,26 @@ typedef struct xfs_buf {
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
int b_error; /* error code on I/O */
+
+ /*
+ * async write failure retry count. Initialised to zero on the first
+ * failure, then when it exceeds the maximum configured without a
+ * success the write is considered to be failed permanently and the
+ * iodone handler will take appropriate action.
+ *
+ * For retry timeouts, we record the jiffie of the first failure. This
+ * means that we can change the retry timeout for buffers already under
+ * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+ *
+ * last_error is used to ensure that we are getting repeated errors, not
+ * different errors. e.g. a block device might change ENOSPC to EIO when
+ * a failure timeout occurs, so we want to re-initialise the error
+ * retry behaviour appropriately when that happens.
+ */
+ int b_retries;
+ unsigned long b_first_retry_time; /* in jiffies */
+ int b_last_error;
+
const struct xfs_buf_ops *b_ops;
#ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 99e91a0e554e..34257992934c 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks(
}
}
-/*
- * This is the iodone() function for buffers which have had callbacks
- * attached to them by xfs_buf_attach_iodone(). It should remove each
- * log item from the buffer's list and call the callback of each in turn.
- * When done, the buffer's fsprivate field is set to NULL and the buffer
- * is unlocked with a call to iodone().
- */
-void
-xfs_buf_iodone_callbacks(
+static bool
+xfs_buf_iodone_callback_error(
struct xfs_buf *bp)
{
struct xfs_log_item *lip = bp->b_fspriv;
struct xfs_mount *mp = lip->li_mountp;
static ulong lasttime;
static xfs_buftarg_t *lasttarg;
-
- if (likely(!bp->b_error))
- goto do_callbacks;
+ struct xfs_error_cfg *cfg;
/*
* If we've already decided to shutdown the filesystem because of
* I/O errors, there's no point in giving this a retry.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
- xfs_buf_stale(bp);
- bp->b_flags |= XBF_DONE;
- trace_xfs_buf_item_iodone(bp, _RET_IP_);
- goto do_callbacks;
- }
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out_stale;
if (bp->b_target != lasttarg ||
time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1079,45 +1066,93 @@ xfs_buf_iodone_callbacks(
}
lasttarg = bp->b_target;
+ /* synchronous writes will have callers process the error */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+ ASSERT(bp->b_iodone != NULL);
+
/*
* If the write was asynchronous then no one will be looking for the
- * error. Clear the error state and write the buffer out again.
- *
- * XXX: This helps against transient write errors, but we need to find
- * a way to shut the filesystem down if the writes keep failing.
- *
- * In practice we'll shut the filesystem down soon as non-transient
- * errors tend to affect the whole device and a failing log write
- * will make us give up. But we really ought to do better here.
+ * error. If this is the first failure of this type, clear the error
+ * state and write the buffer out again. This means we always retry an
+ * async write failure at least once, but we also need to set the buffer
+ * up to behave correctly now for repeated failures.
*/
- if (bp->b_flags & XBF_ASYNC) {
- ASSERT(bp->b_iodone != NULL);
+ if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+ bp->b_last_error != bp->b_error) {
+ bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
+ XBF_DONE | XBF_WRITE_FAIL);
+ bp->b_last_error = bp->b_error;
+ bp->b_retries = 0;
+ bp->b_first_retry_time = jiffies;
+
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_submit(bp);
+ return true;
+ }
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+ /*
+ * Repeated failure on an async write. Take action according to the
+ * error configuration we have been set up to use.
+ */
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
- xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ goto permanent_error;
+ if (cfg->retry_timeout &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ goto permanent_error;
- if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
- bp->b_flags |= XBF_WRITE | XBF_ASYNC |
- XBF_DONE | XBF_WRITE_FAIL;
- xfs_buf_submit(bp);
- } else {
- xfs_buf_relse(bp);
- }
+ /* At unmount we may treat errors differently */
+ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ goto permanent_error;
- return;
- }
+ /* still a transient error, higher layers will retry */
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
/*
- * If the write of the buffer was synchronous, we want to make
- * sure to return the error to the caller of xfs_bwrite().
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
*/
+permanent_error:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+out_stale:
xfs_buf_stale(bp);
bp->b_flags |= XBF_DONE;
-
trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks attached
+ * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
+ * callback list, mark the buffer as having no more callbacks and then push the
+ * buffer through IO completion processing.
+ */
+void
+xfs_buf_iodone_callbacks(
+ struct xfs_buf *bp)
+{
+ /*
+ * If there is an error, process it. Some errors require us
+ * to run callbacks after failure processing is done so we
+ * detect that and take appropriate action.
+ */
+ if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+ return;
+
+ /*
+ * Successful IO or permanent error. Either way, we can clear the
+ * retry state here in preparation for the next error that may occur.
+ */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
-do_callbacks:
xfs_buf_do_callbacks(bp);
bp->b_fspriv = NULL;
bp->b_iodone = NULL;
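
Taken together, the new xfs_buf fields and xfs_buf_iodone_callback_error() above form a small retry state machine: the first failure (or a failure with a different errno) resets the bookkeeping and resubmits the buffer, repeated failures keep being retried until either the configured retry count or a timeout measured from the first failure is exceeded, and only then is the error treated as permanent. A compact userspace model of that decision follows; decide(), struct retry_cfg/retry_state and RETRY_FOREVER are illustrative names rather than kernel symbols, and the flag handling (XBF_WRITE_FAIL, XBF_STALE) is simplified to a last-error comparison.

#include <stdio.h>

#define RETRY_FOREVER	-1

struct retry_cfg   { int max_retries; long timeout; /* 0 = no timeout */ };
struct retry_state { int retries; long first_fail_time; int last_error; };

enum action { RESUBMIT, RETRY_LATER, PERMANENT };

static enum action decide(struct retry_state *st, const struct retry_cfg *cfg,
			  int error, long now)
{
	/* first failure, or the errno changed: restart the accounting */
	if (st->last_error != error) {
		st->last_error = error;
		st->retries = 0;
		st->first_fail_time = now;
		return RESUBMIT;
	}
	if (cfg->max_retries != RETRY_FOREVER && ++st->retries > cfg->max_retries)
		return PERMANENT;
	if (cfg->timeout && now - st->first_fail_time > cfg->timeout)
		return PERMANENT;
	return RETRY_LATER;	/* still transient: higher layers retry */
}

int main(void)
{
	struct retry_cfg cfg = { .max_retries = 2, .timeout = 100 };
	struct retry_state st = { 0 };
	long t;

	for (t = 0; t <= 40; t += 10)
		printf("t=%ld -> %d\n", t, decide(&st, &cfg, -5, t));
	return 0;
}

Recording the time of the first failure rather than a per-retry deadline is what lets an administrator shorten the configured timeout and have it take effect for buffers that are already stuck in the retry loop.
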
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 316b2a1bdba5..e0646659ce16 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -614,11 +614,10 @@ xfs_qm_dqread(
trace_xfs_dqread(dqp);
if (flags & XFS_QMOPT_DQALLOC) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
- XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
if (error)
- goto error1;
+ goto error0;
}
/*
@@ -692,7 +691,7 @@ error0:
* end of the chunk, skip ahead to first id in next allocated chunk
* using the SEEK_DATA interface.
*/
-int
+static int
xfs_dq_get_next_id(
xfs_mount_t *mp,
uint type,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 85ce3032f815..47fc63295422 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -145,12 +145,10 @@ xfs_update_prealloc_flags(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
- error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+ 0, 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1553,7 +1551,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
} else {
ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
@@ -1587,7 +1585,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1624,8 +1622,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
- NULL);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ee3aaa0a5317..b4d75825ae37 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -198,14 +198,10 @@ xfs_growfs_data_private(
return error;
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
- XFS_GROWFS_SPACE_RES(mp), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
return error;
- }
/*
* Write new AG headers to disk. Non-transactional, but written
@@ -243,8 +239,8 @@ xfs_growfs_data_private(
agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- agf->agf_flfirst = 0;
- agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
agf->agf_flcount = 0;
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bf2d60749278..99ee6eee5e0b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,9 +37,6 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
- struct xfs_perag *pag, struct xfs_inode *ip);
-
/*
* Allocate and initialise an xfs_inode.
*/
@@ -94,13 +91,6 @@ xfs_inode_free_callback(
struct inode *inode = container_of(head, struct inode, i_rcu);
struct xfs_inode *ip = XFS_I(inode);
- kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
- struct xfs_inode *ip)
-{
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
@@ -118,6 +108,25 @@ xfs_inode_free(
ip->i_itemp = NULL;
}
+ kmem_zone_free(xfs_inode_zone, ip);
+}
+
+static void
+__xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!xfs_isiflocked(ip));
+ XFS_STATS_DEC(ip->i_mount, vn_active);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
@@ -129,12 +138,123 @@ xfs_inode_free(
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
- /* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(ip->i_mount, vn_active);
+ __xfs_inode_free(ip);
+}
- call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+ struct xfs_mount *mp)
+{
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+ msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+ xfs_reclaim_work_queue(mp);
+}
+
+static void
+xfs_perag_set_reclaim_tag(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ if (pag->pag_ici_reclaimable++)
+ return;
+
+ /* propagate the reclaim tag up into the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&mp->m_perag_lock);
+
+ /* schedule periodic background inode reclaim */
+ xfs_reclaim_work_queue(mp);
+
+ trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+static void
+xfs_perag_clear_reclaim_tag(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ if (--pag->pag_ici_reclaimable)
+ return;
+
+ /* clear the reclaim tag from the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&mp->m_perag_lock);
+ trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ xfs_perag_set_reclaim_tag(pag);
+ __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+STATIC void
+xfs_inode_clear_reclaim_tag(
+ struct xfs_perag *pag,
+ xfs_ino_t ino)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(pag->pag_mount, ino),
+ XFS_ICI_RECLAIM_TAG);
+ xfs_perag_clear_reclaim_tag(pag);
}
/*
@@ -264,7 +384,7 @@ xfs_iget_cache_hit(
*/
ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
ip->i_flags |= XFS_INEW;
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
inode->i_state = I_NEW;
ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag(
}
/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
- struct xfs_mount *mp)
-{
-
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
- queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
- msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
- }
- rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_reclaim_work);
-
- xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_reclaim_work_queue(mp);
-}
-
-static void
-__xfs_inode_set_reclaim_tag(
- struct xfs_perag *pag,
- struct xfs_inode *ip)
-{
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
-
- if (!pag->pag_ici_reclaimable) {
- /* propagate the reclaim tag up into the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_set(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- spin_unlock(&ip->i_mount->m_perag_lock);
-
- /* schedule periodic background inode reclaim */
- xfs_reclaim_work_queue(ip->i_mount);
-
- trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
- }
- pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
- xfs_inode_t *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_set_reclaim_tag(pag, ip);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
- xfs_perag_t *pag,
- xfs_inode_t *ip)
-{
- pag->pag_ici_reclaimable--;
- if (!pag->pag_ici_reclaimable) {
- /* clear the reclaim tag from the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- spin_unlock(&ip->i_mount->m_perag_lock);
- trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
- }
-}
-
-STATIC void
-__xfs_inode_clear_reclaim_tag(
- xfs_mount_t *mp,
- xfs_perag_t *pag,
- xfs_inode_t *ip)
-{
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
- __xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
* Grab the inode for reclaim exclusively.
* Return 0 if we grabbed it, non-zero otherwise.
*/
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
int sync_mode)
{
struct xfs_buf *bp = NULL;
+ xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
int error;
restart:
@@ -993,6 +999,22 @@ restart:
xfs_iflock(ip);
reclaim:
+ /*
+ * Because we use RCU freeing we need to ensure the inode always appears
+ * to be reclaimed with an invalid inode number when in the free state.
+ * We do this as early as possible under the ILOCK and flush lock so
+ * that xfs_iflush_cluster() can be guaranteed to detect races with us
+ * here. By doing this, we guarantee that once xfs_iflush_cluster has
+ * locked both the XFS_ILOCK and the flush lock that it will see either
+ * a valid, flushable inode that will serialise correctly against the
+ * locks below, or it will see a clean (and invalid) inode that it can
+ * skip.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1006,9 +1028,9 @@ reclaim:
*/
spin_lock(&pag->pag_ici_lock);
if (!radix_tree_delete(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+ XFS_INO_TO_AGINO(ip->i_mount, ino)))
ASSERT(0);
- __xfs_inode_clear_reclaim(pag, ip);
+ xfs_perag_clear_reclaim_tag(pag);
spin_unlock(&pag->pag_ici_lock);
/*
@@ -1023,7 +1045,7 @@ reclaim:
xfs_qm_dqdetach(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_inode_free(ip);
+ __xfs_inode_free(ip);
return error;
out_ifunlock:
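
The xfs_perag_set_reclaim_tag()/xfs_perag_clear_reclaim_tag() pair introduced above is a "propagate only on the edge" counting pattern: the AG is tagged in the mount-wide radix tree only when its reclaimable-inode count goes from zero to one, and untagged only when it drops back to zero. A toy model of that pattern, with names invented for illustration:

#include <stdio.h>

static int reclaimable;		/* per-AG count of reclaimable inodes */
static int ag_tagged;		/* whether the AG is tagged in the parent tree */

static void set_reclaim(void)
{
	if (reclaimable++)
		return;		/* already tagged at the parent level */
	ag_tagged = 1;		/* 0 -> 1: propagate the tag upwards */
}

static void clear_reclaim(void)
{
	if (--reclaimable)
		return;		/* other inodes are still reclaimable */
	ag_tagged = 0;		/* 1 -> 0: drop the parent tag */
}

int main(void)
{
	set_reclaim(); set_reclaim(); clear_reclaim();
	printf("tagged=%d count=%d\n", ag_tagged, reclaimable);	/* 1 1 */
	clear_reclaim();
	printf("tagged=%d count=%d\n", ag_tagged, reclaimable);	/* 0 0 */
	return 0;
}
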
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee31..ee6799e0476f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1030,7 +1030,7 @@ xfs_dir_ialloc(
tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
}
- code = xfs_trans_roll(&tp, 0);
+ code = xfs_trans_roll(&tp, NULL);
if (committed != NULL)
*committed = 1;
@@ -1161,11 +1161,9 @@ xfs_create(
rdev = 0;
resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
tres = &M_RES(mp)->tr_mkdir;
- tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
} else {
resblks = XFS_CREATE_SPACE_RES(mp, name->len);
tres = &M_RES(mp)->tr_create;
- tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
/*
@@ -1174,20 +1172,19 @@ xfs_create(
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_reserve(tp, tres, resblks, 0);
+ error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_reserve(tp, tres, resblks, 0);
+ error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
}
if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
- error = xfs_trans_reserve(tp, tres, 0, 0);
+ error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
}
if (error)
- goto out_trans_cancel;
-
+ goto out_release_inode;
xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -1337,17 +1334,16 @@ xfs_create_tmpfile(
return error;
resblks = XFS_IALLOC_SPACE_RES(mp);
- tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
-
tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_reserve(tp, tres, resblks, 0);
+
+ error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
- error = xfs_trans_reserve(tp, tres, 0, 0);
+ error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
}
if (error)
- goto out_trans_cancel;
+ goto out_release_inode;
error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
pdqp, resblks, 1, 0);
@@ -1432,15 +1428,14 @@ xfs_link(
if (error)
goto std_return;
- tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
resblks = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
}
if (error)
- goto error_return;
+ goto std_return;
xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1710,11 +1705,9 @@ xfs_inactive_truncate(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
return error;
}
@@ -1764,8 +1757,6 @@ xfs_inactive_ifree(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
/*
* The ifree transaction might need to allocate blocks for record
* insertion to the finobt. We don't want to fail here at ENOSPC, so
@@ -1781,9 +1772,8 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
- XFS_IFREE_SPACE_RES(mp), 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
if (error) {
if (error == -ENOSPC) {
xfs_warn_ratelimited(mp,
@@ -1792,7 +1782,6 @@ xfs_inactive_ifree(
} else {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
}
- xfs_trans_cancel(tp);
return error;
}
@@ -2525,11 +2514,6 @@ xfs_remove(
if (error)
goto std_return;
- if (is_dir)
- tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
- else
- tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-
/*
* We try to get the real space reservation first,
* allowing for directory btree deletion(s) implying
@@ -2540,14 +2524,15 @@ xfs_remove(
* block from the directory.
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
resblks = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+ &tp);
}
if (error) {
ASSERT(error != -ENOSPC);
- goto out_trans_cancel;
+ goto std_return;
}
xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
@@ -2855,6 +2840,7 @@ xfs_rename_alloc_whiteout(
* and flag it as linkable.
*/
drop_nlink(VFS_I(tmpfile));
+ xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
@@ -2910,15 +2896,15 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
- tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
spaceres = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+ &tp);
}
if (error)
- goto out_trans_cancel;
+ goto out_release_wip;
/*
* Attach the dquots to the inodes
@@ -3155,6 +3141,7 @@ out_bmap_cancel:
xfs_bmap_cancel(&free_list);
out_trans_cancel:
xfs_trans_cancel(tp);
+out_release_wip:
if (wip)
IRELE(wip);
return error;
@@ -3162,16 +3149,16 @@ out_trans_cancel:
STATIC int
xfs_iflush_cluster(
- xfs_inode_t *ip,
- xfs_buf_t *bp)
+ struct xfs_inode *ip,
+ struct xfs_buf *bp)
{
- xfs_mount_t *mp = ip->i_mount;
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
unsigned long first_index, mask;
unsigned long inodes_per_cluster;
- int ilist_size;
- xfs_inode_t **ilist;
- xfs_inode_t *iq;
+ int cilist_size;
+ struct xfs_inode **cilist;
+ struct xfs_inode *cip;
int nr_found;
int clcount = 0;
int bufwasdelwri;
@@ -3180,23 +3167,23 @@ xfs_iflush_cluster(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
- ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
- ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
- if (!ilist)
+ cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
+ if (!cilist)
goto out_put;
mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
first_index, inodes_per_cluster);
if (nr_found == 0)
goto out_free;
for (i = 0; i < nr_found; i++) {
- iq = ilist[i];
- if (iq == ip)
+ cip = cilist[i];
+ if (cip == ip)
continue;
/*
@@ -3205,20 +3192,30 @@ xfs_iflush_cluster(
* We need to check under the i_flags_lock for a valid inode
* here. Skip it if it is not valid or the wrong inode.
*/
- spin_lock(&ip->i_flags_lock);
- if (!ip->i_ino ||
- (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
- spin_unlock(&ip->i_flags_lock);
+ spin_lock(&cip->i_flags_lock);
+ if (!cip->i_ino ||
+ __xfs_iflags_test(cip, XFS_ISTALE)) {
+ spin_unlock(&cip->i_flags_lock);
continue;
}
- spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Once we fall off the end of the cluster, no point checking
+ * any more inodes in the list because they will also all be
+ * outside the cluster.
+ */
+ if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+ spin_unlock(&cip->i_flags_lock);
+ break;
+ }
+ spin_unlock(&cip->i_flags_lock);
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
* later after the appropriate locks are acquired.
*/
- if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+ if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
continue;
/*
@@ -3226,15 +3223,28 @@ xfs_iflush_cluster(
* then this inode cannot be flushed and is skipped.
*/
- if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+ if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
+ continue;
+ if (!xfs_iflock_nowait(cip)) {
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
continue;
- if (!xfs_iflock_nowait(iq)) {
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ }
+ if (xfs_ipincount(cip)) {
+ xfs_ifunlock(cip);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
continue;
}
- if (xfs_ipincount(iq)) {
- xfs_ifunlock(iq);
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+
+
+ /*
+ * Check the inode number again, just to be certain we are not
+ * racing with freeing in xfs_reclaim_inode(). See the comments
+ * in that function for more information as to why the initial
+ * check is not sufficient.
+ */
+ if (!cip->i_ino) {
+ xfs_ifunlock(cip);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
continue;
}
@@ -3242,18 +3252,18 @@ xfs_iflush_cluster(
* arriving here means that this inode can be flushed. First
* re-check that it's dirty before flushing.
*/
- if (!xfs_inode_clean(iq)) {
+ if (!xfs_inode_clean(cip)) {
int error;
- error = xfs_iflush_int(iq, bp);
+ error = xfs_iflush_int(cip, bp);
if (error) {
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
goto cluster_corrupt_out;
}
clcount++;
} else {
- xfs_ifunlock(iq);
+ xfs_ifunlock(cip);
}
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
}
if (clcount) {
@@ -3263,7 +3273,7 @@ xfs_iflush_cluster(
out_free:
rcu_read_unlock();
- kmem_free(ilist);
+ kmem_free(cilist);
out_put:
xfs_perag_put(pag);
return 0;
@@ -3306,8 +3316,8 @@ cluster_corrupt_out:
/*
* Unlocks the flush lock
*/
- xfs_iflush_abort(iq, false);
- kmem_free(ilist);
+ xfs_iflush_abort(cip, false);
+ kmem_free(cilist);
xfs_perag_put(pag);
return -EFSCORRUPTED;
}
@@ -3327,7 +3337,7 @@ xfs_iflush(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp;
+ struct xfs_buf *bp = NULL;
struct xfs_dinode *dip;
int error;
@@ -3369,14 +3379,22 @@ xfs_iflush(
}
/*
- * Get the buffer containing the on-disk inode.
+ * Get the buffer containing the on-disk inode. We are doing a try-lock
+ * operation here, so we may get an EAGAIN error. In that case, we
+ * simply want to return with the inode still dirty.
+ *
+ * If we get any other error, we effectively have a corruption situation
+ * and we cannot flush the inode, so we treat it the same as failing
+ * xfs_iflush_int().
*/
error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
0);
- if (error || !bp) {
+ if (error == -EAGAIN) {
xfs_ifunlock(ip);
return error;
}
+ if (error)
+ goto corrupt_out;
/*
* First flush out the inode that xfs_iflush was called with.
@@ -3404,7 +3422,8 @@ xfs_iflush(
return 0;
corrupt_out:
- xfs_buf_relse(bp);
+ if (bp)
+ xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
error = -EFSCORRUPTED;
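
The reclaim and cluster-flush changes above close a race by combining an unlocked filter with a locked re-check: reclaim zeroes ip->i_ino under i_flags_lock before the inode is freed, and xfs_iflush_cluster() checks i_ino again once it holds the inode and flush locks, skipping anything that went invalid in between. A compact pthread model of that check/lock/re-check pattern (struct obj, reclaim() and try_flush() are invented names, not kernel locking primitives):

#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;
	unsigned long id;	/* 0 means "being freed, do not touch" */
};

/* reclaim side: invalidate under the lock before tearing down */
static void reclaim(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	o->id = 0;
	pthread_mutex_unlock(&o->lock);
	/* the real freeing would wait for an RCU grace period */
}

/* flush side: unlocked check first, then re-check once the lock is held */
static int try_flush(struct obj *o)
{
	if (!o->id)			/* cheap unlocked filter */
		return 0;
	pthread_mutex_lock(&o->lock);
	if (!o->id) {			/* lost the race: skip it */
		pthread_mutex_unlock(&o->lock);
		return 0;
	}
	printf("flushing object %lu\n", o->id);
	pthread_mutex_unlock(&o->lock);
	return 1;
}

int main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, 42 };

	try_flush(&o);	/* flushes */
	reclaim(&o);
	try_flush(&o);	/* skipped: id was cleared under the lock */
	return 0;
}
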
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 43e1d51b15eb..e52d7c7aeb5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -440,6 +440,9 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
/* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+
/*
* When setting up a newly allocated inode, we need to call
* xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -447,7 +450,6 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
* before we've completed instantiation. Otherwise we can do it
* the moment the inode lookup is complete.
*/
-extern void xfs_setup_inode(struct xfs_inode *ip);
static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
{
xfs_iflags_clear(ip, XFS_INEW);
@@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
{
xfs_setup_inode(ip);
+ xfs_setup_iops(ip);
xfs_finish_inode_setup(ip);
}
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c48b5b18d771..a1b07612224c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork(
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_real_bytes == 0 ||
- ip->i_df.if_real_bytes == data_bytes);
+ ip->i_df.if_real_bytes >= data_bytes);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_d.di_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
@@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork(
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
ASSERT(ip->i_afp->if_real_bytes == 0 ||
- ip->i_afp->if_real_bytes == data_bytes);
+ ip->i_afp->if_real_bytes >= data_bytes);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
ip->i_afp->if_u1.if_data,
@@ -479,6 +479,8 @@ STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
struct list_head *buffer_list)
+ __releases(&lip->li_ailp->xa_lock)
+ __acquires(&lip->li_ailp->xa_lock)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcb6c19ce3ea..dbca7375deef 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -277,7 +277,6 @@ xfs_readlink_by_handle(
{
struct dentry *dentry;
__u32 olen;
- void *link;
int error;
if (!capable(CAP_SYS_ADMIN))
@@ -288,7 +287,7 @@ xfs_readlink_by_handle(
return PTR_ERR(dentry);
/* Restrict this handle operation to symlinks only. */
- if (!d_is_symlink(dentry)) {
+ if (!d_inode(dentry)->i_op->readlink) {
error = -EINVAL;
goto out_dput;
}
@@ -298,21 +297,8 @@ xfs_readlink_by_handle(
goto out_dput;
}
- link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
- if (!link) {
- error = -ENOMEM;
- goto out_dput;
- }
-
- error = xfs_readlink(XFS_I(d_inode(dentry)), link);
- if (error)
- goto out_kfree;
- error = readlink_copy(hreq->ohandle, olen, link);
- if (error)
- goto out_kfree;
+ error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
- out_kfree:
- kfree(link);
out_dput:
dput(dentry);
return error;
@@ -334,12 +320,10 @@ xfs_set_dmattrs(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
return error;
- }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1141,10 +1125,9 @@ xfs_ioctl_setattr_get_trans(
if (XFS_FORCED_SHUTDOWN(mp))
goto out_unlock;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
if (error)
- goto out_cancel;
+ return ERR_PTR(error);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d81bdc080370..58391355a44d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
int error;
int lockmode;
int bmapi_flags = XFS_BMAPI_PREALLOC;
+ uint tflags = 0;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
@@ -192,11 +193,6 @@ xfs_iomap_write_direct(
return error;
/*
- * Allocate and setup the transaction
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
- /*
* For DAX, we do not allocate unwritten extents, but instead we zero
* the block before we commit the transaction. Ideally we'd like to do
* this outside the transaction context, but if we commit and then crash
@@ -209,23 +205,17 @@ xfs_iomap_write_direct(
* the reserve block pool for bmbt block allocation if there is no space
* left but we need to do unwritten extent conversion.
*/
-
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (ISUNWRITTEN(imap)) {
- tp->t_flags |= XFS_TRANS_RESERVE;
+ tflags |= XFS_TRANS_RESERVE;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
}
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- resblks, resrtextents);
- /*
- * Check for running out of space, note: need lock to return
- */
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
+ tflags, &tp);
+ if (error)
return error;
- }
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
@@ -726,15 +716,13 @@ xfs_iomap_write_allocate(
nimaps = 0;
while (nimaps == 0) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
- tp->t_flags |= XFS_TRANS_RESERVE;
nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- nres, 0);
- if (error) {
- xfs_trans_cancel(tp);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+ 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
return error;
- }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -878,25 +866,18 @@ xfs_iomap_write_unwritten(
do {
/*
- * set up a transaction to convert the range of extents
+ * Set up a transaction to convert the range of extents
* from unwritten to real. Do allocations in a loop until
* we have covered the range passed in.
*
- * Note that we open code the transaction allocation here
- * to pass KM_NOFS--we can't risk to recursing back into
- * the filesystem here as we might be asked to write out
- * the same inode that we complete here and might deadlock
- * on the iolock.
+ * Note that we can't risk recursing back into the filesystem
+ * here as we might be asked to write out the same inode that we
+ * complete here and might deadlock on the iolock.
*/
- sb_start_intwrite(mp->m_super);
- tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
- tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- resblks, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fb7dc61f4a29..c5d4eba6972e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -181,6 +181,8 @@ xfs_generic_create(
}
#endif
+ xfs_setup_iops(ip);
+
if (tmpfile)
d_tmpfile(dentry, inode);
else
@@ -368,6 +370,8 @@ xfs_vn_symlink(
if (unlikely(error))
goto out_cleanup_inode;
+ xfs_setup_iops(cip);
+
d_instantiate(dentry, inode);
xfs_finish_inode_setup(cip);
return 0;
@@ -442,6 +446,16 @@ xfs_vn_get_link(
return ERR_PTR(error);
}
+STATIC const char *
+xfs_vn_get_link_inline(
+ struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+ return XFS_I(inode)->i_df.if_u1.if_data;
+}
+
STATIC int
xfs_vn_getattr(
struct vfsmount *mnt,
@@ -599,12 +613,12 @@ xfs_setattr_nonsize(
return error;
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ goto out_dqrele;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* Change file ownership. Must be the owner or privileged.
@@ -633,12 +647,10 @@ xfs_setattr_nonsize(
NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
- goto out_unlock;
+ goto out_cancel;
}
}
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Change file ownership. Must be the owner or privileged.
*/
@@ -722,10 +734,9 @@ xfs_setattr_nonsize(
return 0;
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_trans_cancel:
+out_cancel:
xfs_trans_cancel(tp);
+out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
return error;
@@ -834,7 +845,7 @@ xfs_setattr_size(
* We have to do all the page cache truncate work outside the
* transaction context as the "lock" order is page lock->log space
* reservation as defined by extent allocation in the writeback path.
- * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+ * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
* having already truncated the in-memory version of the file (i.e. made
* user visible changes). There's not much we can do about this, except
* to hope that the caller sees ENOMEM and retries the truncate
@@ -849,10 +860,9 @@ xfs_setattr_size(
return error;
truncate_setsize(inode, newsize);
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ return error;
lock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -971,12 +981,9 @@ xfs_vn_update_time(
trace_xfs_update_time(ip);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (flags & S_CTIME)
@@ -1167,6 +1174,18 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.update_time = xfs_vn_update_time,
};
+static const struct inode_operations xfs_inline_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .get_link = xfs_vn_get_link_inline,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
+};
+
STATIC void
xfs_diflags_to_iflags(
struct inode *inode,
@@ -1193,7 +1212,7 @@ xfs_diflags_to_iflags(
}
/*
- * Initialize the Linux inode and set up the operation vectors.
+ * Initialize the Linux inode.
*
* When reading existing inodes from disk this is called directly from xfs_iget,
* when creating a new inode it is called from xfs_ialloc after setting up the
@@ -1232,32 +1251,12 @@ xfs_setup_inode(
i_size_write(inode, ip->i_d.di_size);
xfs_diflags_to_iflags(inode, ip);
- ip->d_ops = ip->i_mount->m_nondir_inode_ops;
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- inode->i_op = &xfs_inode_operations;
- inode->i_fop = &xfs_file_operations;
- inode->i_mapping->a_ops = &xfs_address_space_operations;
- break;
- case S_IFDIR:
+ if (S_ISDIR(inode->i_mode)) {
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
- if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
- inode->i_op = &xfs_dir_ci_inode_operations;
- else
- inode->i_op = &xfs_dir_inode_operations;
- inode->i_fop = &xfs_dir_file_operations;
ip->d_ops = ip->i_mount->m_dir_inode_ops;
- break;
- case S_IFLNK:
- inode->i_op = &xfs_symlink_inode_operations;
- if (!(ip->i_df.if_flags & XFS_IFINLINE))
- inode->i_mapping->a_ops = &xfs_address_space_operations;
- break;
- default:
- inode->i_op = &xfs_inode_operations;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- break;
+ } else {
+ ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
}
/*
@@ -1277,3 +1276,35 @@ xfs_setup_inode(
cache_no_acl(inode);
}
}
+
+void
+xfs_setup_iops(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = &ip->i_vnode;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &xfs_inode_operations;
+ inode->i_fop = &xfs_file_operations;
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ case S_IFDIR:
+ if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+ inode->i_op = &xfs_dir_ci_inode_operations;
+ else
+ inode->i_op = &xfs_dir_inode_operations;
+ inode->i_fop = &xfs_dir_file_operations;
+ break;
+ case S_IFLNK:
+ if (ip->i_df.if_flags & XFS_IFINLINE)
+ inode->i_op = &xfs_inline_symlink_inode_operations;
+ else
+ inode->i_op = &xfs_symlink_inode_operations;
+ break;
+ default:
+ inode->i_op = &xfs_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ }
+}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b49ccf5c1d75..bde02f1fba73 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -435,8 +435,7 @@ xfs_log_reserve(
int cnt,
struct xlog_ticket **ticp,
__uint8_t client,
- bool permanent,
- uint t_type)
+ bool permanent)
{
struct xlog *log = mp->m_log;
struct xlog_ticket *tic;
@@ -456,7 +455,6 @@ xfs_log_reserve(
if (!tic)
return -ENOMEM;
- tic->t_trans_type = t_type;
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} while (iclog != first_iclog);
#endif
if (! (XLOG_FORCED_SHUTDOWN(log))) {
- error = xfs_log_reserve(mp, 600, 1, &tic,
- XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+ error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
if (!error) {
/* the data section must be 32 bit size aligned */
struct {
@@ -2032,58 +2029,8 @@ xlog_print_tic_res(
REG_TYPE_STR(ICREATE, "inode create")
};
#undef REG_TYPE_STR
-#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type
- static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
- TRANS_TYPE_STR(SETATTR_NOT_SIZE),
- TRANS_TYPE_STR(SETATTR_SIZE),
- TRANS_TYPE_STR(INACTIVE),
- TRANS_TYPE_STR(CREATE),
- TRANS_TYPE_STR(CREATE_TRUNC),
- TRANS_TYPE_STR(TRUNCATE_FILE),
- TRANS_TYPE_STR(REMOVE),
- TRANS_TYPE_STR(LINK),
- TRANS_TYPE_STR(RENAME),
- TRANS_TYPE_STR(MKDIR),
- TRANS_TYPE_STR(RMDIR),
- TRANS_TYPE_STR(SYMLINK),
- TRANS_TYPE_STR(SET_DMATTRS),
- TRANS_TYPE_STR(GROWFS),
- TRANS_TYPE_STR(STRAT_WRITE),
- TRANS_TYPE_STR(DIOSTRAT),
- TRANS_TYPE_STR(WRITEID),
- TRANS_TYPE_STR(ADDAFORK),
- TRANS_TYPE_STR(ATTRINVAL),
- TRANS_TYPE_STR(ATRUNCATE),
- TRANS_TYPE_STR(ATTR_SET),
- TRANS_TYPE_STR(ATTR_RM),
- TRANS_TYPE_STR(ATTR_FLAG),
- TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
- TRANS_TYPE_STR(SB_CHANGE),
- TRANS_TYPE_STR(DUMMY1),
- TRANS_TYPE_STR(DUMMY2),
- TRANS_TYPE_STR(QM_QUOTAOFF),
- TRANS_TYPE_STR(QM_DQALLOC),
- TRANS_TYPE_STR(QM_SETQLIM),
- TRANS_TYPE_STR(QM_DQCLUSTER),
- TRANS_TYPE_STR(QM_QINOCREATE),
- TRANS_TYPE_STR(QM_QUOTAOFF_END),
- TRANS_TYPE_STR(FSYNC_TS),
- TRANS_TYPE_STR(GROWFSRT_ALLOC),
- TRANS_TYPE_STR(GROWFSRT_ZERO),
- TRANS_TYPE_STR(GROWFSRT_FREE),
- TRANS_TYPE_STR(SWAPEXT),
- TRANS_TYPE_STR(CHECKPOINT),
- TRANS_TYPE_STR(ICREATE),
- TRANS_TYPE_STR(CREATE_TMPFILE)
- };
-#undef TRANS_TYPE_STR
xfs_warn(mp, "xlog_write: reservation summary:");
- xfs_warn(mp, " trans type = %s (%u)",
- ((ticket->t_trans_type <= 0 ||
- ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
- "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
- ticket->t_trans_type);
xfs_warn(mp, " unit res = %d bytes",
ticket->t_unit_res);
xfs_warn(mp, " current res = %d bytes",
@@ -3378,7 +3325,7 @@ xfs_log_force(
{
int error;
- trace_xfs_log_force(mp, 0);
+ trace_xfs_log_force(mp, 0, _RET_IP_);
error = _xfs_log_force(mp, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3527,7 +3474,7 @@ xfs_log_force_lsn(
{
int error;
- trace_xfs_log_force(mp, lsn);
+ trace_xfs_log_force(mp, lsn, _RET_IP_);
error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3709,7 +3656,6 @@ xlog_ticket_alloc(
tic->t_tid = prandom_u32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
- tic->t_trans_type = 0;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index aa533a7d50f2..80ba0c047090 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -161,8 +161,7 @@ int xfs_log_reserve(struct xfs_mount *mp,
int count,
struct xlog_ticket **ticket,
__uint8_t clientid,
- bool permanent,
- uint t_type);
+ bool permanent);
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4e7649351f5a..5e54e7955ea6 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -51,7 +51,6 @@ xlog_cil_ticket_alloc(
tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
KM_SLEEP|KM_NOFS);
- tic->t_trans_type = XFS_TRANS_CHECKPOINT;
/*
* set the current reservation to zero so we know to steal the basic
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ed8896310c00..765f084759b5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -175,7 +175,6 @@ typedef struct xlog_ticket {
char t_cnt; /* current count : 1 */
char t_clientid; /* who does this belong to; : 1 */
char t_flags; /* properties of reservation : 1 */
- uint t_trans_type; /* transaction type : 4 */
/* reservation array fields */
uint t_res_num; /* num in array : 4 */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 396565f43247..835997843846 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+ ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4205,10 +4205,9 @@ xlog_recover_process_efi(
}
}
- tp = xfs_trans_alloc(mp, 0);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
- goto abort_error;
+ return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket(
int offset;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
if (error)
- goto out_abort;
+ goto out_error;
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index cfd4210dd015..e39b02351b4a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -89,7 +89,6 @@ xfs_uuid_mount(
if (hole < 0) {
xfs_uuid_table = kmem_realloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- xfs_uuid_table_size * sizeof(*xfs_uuid_table),
KM_SLEEP);
hole = xfs_uuid_table_size++;
}
@@ -681,6 +680,9 @@ xfs_mountfs(
xfs_set_maxicount(mp);
+ /* enable fail_at_unmount as default */
+ mp->m_fail_unmount = 1;
+
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
if (error)
goto out;
@@ -690,10 +692,15 @@ xfs_mountfs(
if (error)
goto out_remove_sysfs;
- error = xfs_uuid_mount(mp);
+ error = xfs_error_sysfs_init(mp);
if (error)
goto out_del_stats;
+
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_remove_error_sysfs;
+
/*
* Set the minimum read and write sizes
*/
@@ -957,6 +964,7 @@ xfs_mountfs(
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -968,6 +976,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_remove_error_sysfs:
+ xfs_error_sysfs_del(mp);
out_del_stats:
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
@@ -1006,6 +1016,14 @@ xfs_unmountfs(
xfs_log_force(mp, XFS_LOG_SYNC);
/*
+ * We now need to tell the world we are unmounting. This will allow
+ * us to detect that the filesystem is going away and we should error
+ * out anything that we have been retrying in the background. This will
+ * prevent never-ending retries in AIL pushing from hanging the unmount.
+ */
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+ /*
* Flush all pending changes from the AIL.
*/
xfs_ail_push_all_sync(mp->m_ail);
@@ -1056,6 +1074,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_error_sysfs_del(mp);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
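
Adding xfs_error_sysfs_init() to the mount path also means adding a matching teardown label to the reverse-order unwind ladder in xfs_mountfs() and a matching call in xfs_unmountfs(). The goto-based unwind pattern itself is easy to get wrong when a step is inserted; a self-contained sketch of the pattern, with invented step names, is:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("init %s\n", name);
	return fail ? -1 : 0;
}

static void undo(const char *name) { printf("undo %s\n", name); }

/* each init step gets a matching undo label, unwound in reverse order */
static int do_mount(int fail_at)
{
	int error;

	error = step("sysfs", fail_at == 1);
	if (error)
		goto out;
	error = step("stats", fail_at == 2);
	if (error)
		goto out_remove_sysfs;
	error = step("error_sysfs", fail_at == 3);	/* the newly inserted step */
	if (error)
		goto out_del_stats;
	error = step("uuid", fail_at == 4);
	if (error)
		goto out_remove_error_sysfs;
	return 0;

out_remove_error_sysfs:
	undo("error_sysfs");
out_del_stats:
	undo("stats");
out_remove_sysfs:
	undo("sysfs");
out:
	return error;
}

int main(void) { return do_mount(4) ? 0 : 1; }
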
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index eafe257b357a..c1b798c72126 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -37,6 +37,32 @@ enum {
XFS_LOWSP_MAX,
};
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+ XFS_ERR_METADATA,
+ XFS_ERR_CLASS_MAX,
+};
+enum {
+ XFS_ERR_DEFAULT,
+ XFS_ERR_EIO,
+ XFS_ERR_ENOSPC,
+ XFS_ERR_ENODEV,
+ XFS_ERR_ERRNO_MAX,
+};
+
+#define XFS_ERR_RETRY_FOREVER -1
+
+struct xfs_error_cfg {
+ struct xfs_kobj kobj;
+ int max_retries;
+ unsigned long retry_timeout; /* in jiffies, 0 = no timeout */
+};
+
typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
@@ -127,6 +153,9 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xfs_kobj m_error_kobj;
+ struct xfs_kobj m_error_meta_kobj;
+ struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
@@ -148,6 +177,7 @@ typedef struct xfs_mount {
*/
__uint32_t m_generation;
+ bool m_fail_unmount;
#ifdef DEBUG
/*
* DEBUG mode instrumentation to test and/or trigger delayed allocation
@@ -166,6 +196,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
+#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
@@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
xfs_off_t count_fsb);
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+ int error_class, int error);
+
#endif /* __XFS_MOUNT_H__ */
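
The new m_error_cfg table is indexed by an error class and a small set of configurable errnos, with everything else falling back to the default slot. The real lookup is xfs_error_get_cfg(), whose body is not part of this hunk; the sketch below is only a plausible illustration of how such a table might be addressed, using invented names.

#include <errno.h>
#include <stdio.h>

enum { ERR_METADATA, ERR_CLASS_MAX };
enum { ERR_DEFAULT, ERR_EIO, ERR_ENOSPC, ERR_ENODEV, ERR_ERRNO_MAX };

struct error_cfg { int max_retries; long retry_timeout; };

static struct error_cfg error_cfg[ERR_CLASS_MAX][ERR_ERRNO_MAX];

/* map a raw (possibly negative) errno onto one of the configurable slots */
static struct error_cfg *error_get_cfg(int error_class, int error)
{
	switch (error < 0 ? -error : error) {
	case EIO:	return &error_cfg[error_class][ERR_EIO];
	case ENOSPC:	return &error_cfg[error_class][ERR_ENOSPC];
	case ENODEV:	return &error_cfg[error_class][ERR_ENODEV];
	default:	return &error_cfg[error_class][ERR_DEFAULT];
	}
}

int main(void)
{
	error_cfg[ERR_METADATA][ERR_ENOSPC].max_retries = 5;

	printf("ENOSPC retries: %d\n",
	       error_get_cfg(ERR_METADATA, -ENOSPC)->max_retries);	/* 5 */
	printf("EFBIG retries: %d\n",
	       error_get_cfg(ERR_METADATA, -EFBIG)->max_retries);	/* 0, default slot */
	return 0;
}
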
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 51ddaf2c2b8c..d5b756669fb5 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -308,12 +308,9 @@ xfs_fs_commit_blocks(
goto out_drop_iolock;
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
goto out_drop_iolock;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index be125e1758c1..a60d9e2739d1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -783,13 +783,10 @@ xfs_qm_qino_alloc(
}
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
- XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+ XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
return error;
- }
if (need_alloc) {
error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index f4d0e0a8f517..475a3882a81f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
- xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
goto out_put;
}
@@ -436,12 +434,9 @@ xfs_qm_scall_setqlim(
defq = xfs_get_defquota(dqp, q);
xfs_dqunlock(dqp);
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
+ if (error)
goto out_rele;
- }
xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
@@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end(
int error;
xfs_qoff_logitem_t *qoffi;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+ if (error)
return error;
- }
qoffi = xfs_trans_get_qoff_item(tp, startqoff,
flags & XFS_ALL_QUOTA_ACCT);
@@ -603,12 +594,9 @@ xfs_qm_log_quotaoff(
*qoffstartp = NULL;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+ if (error)
goto out;
- }
qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
xfs_trans_log_quotaoff_item(tp, qoffi);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index abf44435d04a..3938b37d1043 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,15 +780,14 @@ xfs_growfs_rt_alloc(
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
* Reserve space & log for one extent added to the file.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
- resblks, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
+ 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ return error;
/*
* Lock the inode.
*/
@@ -823,14 +822,13 @@ xfs_growfs_rt_alloc(
for (bno = map.br_startoff, fsbno = map.br_startblock;
bno < map.br_startoff + map.br_blockcount;
bno++, fsbno++) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
/*
* Reserve log for one block zeroing.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
- 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
+ 0, 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ return error;
/*
* Lock the bitmap inode.
*/
@@ -994,11 +992,10 @@ xfs_growfs_rt(
/*
* Start a transaction, get the log reservation.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
- 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
+ &tp);
if (error)
- goto error_cancel;
+ break;
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 187e14b696c2..11ea5d51db56 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -58,8 +58,7 @@
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
+struct bio_set *xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
@@ -350,6 +349,7 @@ xfs_parseargs(
case Opt_pqnoenforce:
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ break;
case Opt_gquota:
case Opt_grpquota:
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
@@ -928,7 +928,7 @@ xfs_fs_alloc_inode(
/*
* Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
+ * the linux inode, we can inactivate and reclaim the inode.
*/
STATIC void
xfs_fs_destroy_inode(
@@ -938,9 +938,14 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(ip->i_mount, vn_reclaim);
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
+
+ xfs_inactive(ip);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
/*
* We should never get here with one of the reclaim flags already set.
@@ -987,24 +992,6 @@ xfs_fs_inode_init_once(
"xfsino", ip->i_ino);
}
-STATIC void
-xfs_fs_evict_inode(
- struct inode *inode)
-{
- xfs_inode_t *ip = XFS_I(inode);
-
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-
- trace_xfs_evict_inode(ip);
-
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
- XFS_STATS_INC(ip->i_mount, vn_rele);
- XFS_STATS_INC(ip->i_mount, vn_remove);
-
- xfs_inactive(ip);
-}
-
/*
* We do an unlocked check for XFS_IDONTCACHE here because we are already
* serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1276,6 +1263,16 @@ xfs_fs_remount(
return -EINVAL;
}
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_ro_compat_feature(sbp,
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EINVAL;
+ }
+
mp->m_flags &= ~XFS_MOUNT_RDONLY;
/*
@@ -1558,14 +1555,12 @@ xfs_fs_fill_super(
if (mp->m_flags & XFS_MOUNT_DAX) {
xfs_warn(mp,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- if (sb->s_blocksize != PAGE_SIZE) {
- xfs_alert(mp,
- "Filesystem block size invalid for DAX Turning DAX off.");
- mp->m_flags &= ~XFS_MOUNT_DAX;
- } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+
+ error = bdev_dax_supported(sb, sb->s_blocksize);
+ if (error) {
xfs_alert(mp,
- "Block device does not support DAX Turning DAX off.");
+ "DAX unsupported by block device. Turning off DAX.");
mp->m_flags &= ~XFS_MOUNT_DAX;
}
}
@@ -1663,7 +1658,6 @@ xfs_fs_free_cached_objects(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
- .evict_inode = xfs_fs_evict_inode,
.drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
@@ -1688,20 +1682,15 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
-
- xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
- if (!xfs_ioend_zone)
+ xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+ offsetof(struct xfs_ioend, io_inline_bio));
+ if (!xfs_ioend_bioset)
goto out;
- xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
- xfs_ioend_zone);
- if (!xfs_ioend_pool)
- goto out_destroy_ioend_zone;
-
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
"xfs_log_ticket");
if (!xfs_log_ticket_zone)
- goto out_destroy_ioend_pool;
+ goto out_free_ioend_bioset;
xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
"xfs_bmap_free_item");
@@ -1797,10 +1786,8 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_bmap_free_item_zone);
out_destroy_log_ticket_zone:
kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
- mempool_destroy(xfs_ioend_pool);
- out_destroy_ioend_zone:
- kmem_zone_destroy(xfs_ioend_zone);
+ out_free_ioend_bioset:
+ bioset_free(xfs_ioend_bioset);
out:
return -ENOMEM;
}
@@ -1826,9 +1813,7 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
kmem_zone_destroy(xfs_log_ticket_zone);
- mempool_destroy(xfs_ioend_pool);
- kmem_zone_destroy(xfs_ioend_zone);
-
+ bioset_free(xfs_ioend_bioset);
}
STATIC int __init
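The per-ioend slab and mempool are replaced by a bio_set whose front padding is sized to hold a struct xfs_ioend, so the ioend can live in the same allocation as the first bio it submits. A hedged sketch of how such a bioset is typically consumed (illustrative only; the real allocation sites are presumably in fs/xfs/xfs_aops.c, outside this hunk):

	/* Illustration, not from this patch: carve an ioend out of a bio's front pad. */
	struct bio		*bio;
	struct xfs_ioend	*ioend;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
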
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b44284c1adda..08a46c6181fd 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -131,6 +131,8 @@ xfs_readlink(
trace_xfs_readlink(ip);
+ ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
+
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -150,12 +152,7 @@ xfs_readlink(
}
- if (ip->i_df.if_flags & XFS_IFINLINE) {
- memcpy(link, ip->i_df.if_u1.if_data, pathlen);
- link[pathlen] = '\0';
- } else {
- error = xfs_readlink_bmap(ip, link);
- }
+ error = xfs_readlink_bmap(ip, link);
out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -221,7 +218,6 @@ xfs_symlink(
if (error)
return error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
/*
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
@@ -231,13 +227,15 @@ xfs_symlink(
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
if (error == -ENOSPC && fs_blocks == 0) {
resblks = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
+ &tp);
}
if (error)
- goto out_trans_cancel;
+ goto out_release_inode;
xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -302,19 +300,11 @@ xfs_symlink(
* If the symlink will fit into the inode, write it inline.
*/
if (pathlen <= XFS_IFORK_DSIZE(ip)) {
- xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
- memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
- ip->i_d.di_size = pathlen;
-
- /*
- * The inode was initially created in extent format.
- */
- ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
- ip->i_df.if_flags |= XFS_IFINLINE;
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+ ip->i_d.di_size = pathlen;
ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
} else {
int offset;
@@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt(
*/
ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 6ced4f143494..4c2c55086208 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -17,10 +17,11 @@
*/
#include "xfs.h"
-#include "xfs_sysfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_stats.h"
@@ -362,3 +363,291 @@ struct kobj_type xfs_log_ktype = {
.sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
+
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ * ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xfs_error_cfg, kobj);
+}
+
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xfs_mount, m_error_kobj);
+}
+
+static ssize_t
+max_retries_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+}
+
+static ssize_t
+max_retries_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < -1)
+ return -EINVAL;
+
+ cfg->max_retries = val;
+ return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+
+static ssize_t
+retry_timeout_seconds_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%ld\n",
+ jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+}
+
+static ssize_t
+retry_timeout_seconds_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ /* 1 day timeout maximum */
+ if (val < 0 || val > 86400)
+ return -EINVAL;
+
+ cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+
+static ssize_t
+fail_at_unmount_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_mount *mp = err_to_mp(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+}
+
+static ssize_t
+fail_at_unmount_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = err_to_mp(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ mp->m_fail_unmount = val;
+ return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+
+static struct attribute *xfs_error_attrs[] = {
+ ATTR_LIST(max_retries),
+ ATTR_LIST(retry_timeout_seconds),
+ NULL,
+};
+
+
+struct kobj_type xfs_error_cfg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_error_attrs,
+};
+
+struct kobj_type xfs_error_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+};
+
+/*
+ * Error initialization tables. These need to be ordered in the same
+ * order as the enums used to index the array. All class init tables need to
+ * define a "default" behaviour as the first entry, all other entries can be
+ * empty.
+ */
+struct xfs_error_init {
+ char *name;
+ int max_retries;
+ int retry_timeout; /* in seconds */
+};
+
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+ { .name = "default",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = 0,
+ },
+ { .name = "EIO",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = 0,
+ },
+ { .name = "ENOSPC",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = 0,
+ },
+ { .name = "ENODEV",
+ .max_retries = 0,
+ },
+};
+
+static int
+xfs_error_sysfs_init_class(
+ struct xfs_mount *mp,
+ int class,
+ const char *parent_name,
+ struct xfs_kobj *parent_kobj,
+ const struct xfs_error_init init[])
+{
+ struct xfs_error_cfg *cfg;
+ int error;
+ int i;
+
+ ASSERT(class < XFS_ERR_CLASS_MAX);
+
+ error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+ &mp->m_error_kobj, parent_name);
+ if (error)
+ return error;
+
+ for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+ cfg = &mp->m_error_cfg[class][i];
+ error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+ parent_kobj, init[i].name);
+ if (error)
+ goto out_error;
+
+ cfg->max_retries = init[i].max_retries;
+ cfg->retry_timeout = msecs_to_jiffies(
+ init[i].retry_timeout * MSEC_PER_SEC);
+ }
+ return 0;
+
+out_error:
+ /* unwind the entries that succeeded */
+ for (i--; i >= 0; i--) {
+ cfg = &mp->m_error_cfg[class][i];
+ xfs_sysfs_del(&cfg->kobj);
+ }
+ xfs_sysfs_del(parent_kobj);
+ return error;
+}
+
+int
+xfs_error_sysfs_init(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ /* .../xfs/<dev>/error/ */
+ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+ &mp->m_kobj, "error");
+ if (error)
+ return error;
+
+ error = sysfs_create_file(&mp->m_error_kobj.kobject,
+ ATTR_LIST(fail_at_unmount));
+
+ if (error)
+ goto out_error;
+
+ /* .../xfs/<dev>/error/metadata/ */
+ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+ "metadata", &mp->m_error_meta_kobj,
+ xfs_error_meta_init);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ xfs_sysfs_del(&mp->m_error_kobj);
+ return error;
+}
+
+void
+xfs_error_sysfs_del(
+ struct xfs_mount *mp)
+{
+ struct xfs_error_cfg *cfg;
+ int i, j;
+
+ for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
+ for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
+ cfg = &mp->m_error_cfg[i][j];
+
+ xfs_sysfs_del(&cfg->kobj);
+ }
+ }
+ xfs_sysfs_del(&mp->m_error_meta_kobj);
+ xfs_sysfs_del(&mp->m_error_kobj);
+}
+
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+ struct xfs_mount *mp,
+ int error_class,
+ int error)
+{
+ struct xfs_error_cfg *cfg;
+
+ switch (error) {
+ case EIO:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+ break;
+ case ENOSPC:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+ break;
+ case ENODEV:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+ break;
+ default:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+ break;
+ }
+
+ return cfg;
+}
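xfs_error_get_cfg() maps a (class, errno) pair onto one of the per-mount configuration slots initialised above, and the corresponding knobs surface under .../xfs/<dev>/error/metadata/<errno>/. A speculative consumer sketch (not part of this hunk; the metadata-writeback caller and the "retries" counter are placeholders for code added elsewhere in the series):

	struct xfs_error_cfg	*cfg;

	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, EIO);
	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
	    retries > cfg->max_retries) {
		/* exhausted the configured retries: give up on this buffer */
	}
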
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index be692e59938d..d04637181ef2 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -58,4 +58,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}
+int xfs_error_sysfs_init(struct xfs_mount *mp);
+void xfs_error_sysfs_del(struct xfs_mount *mp);
+
#endif /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d58426008e..ea94ee0fe5ea 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
@@ -944,7 +943,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
TP_ARGS(log, tic),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(unsigned, trans_type)
__field(char, ocnt)
__field(char, cnt)
__field(int, curr_res)
@@ -962,7 +960,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
),
TP_fast_assign(
__entry->dev = log->l_mp->m_super->s_dev;
- __entry->trans_type = tic->t_trans_type;
__entry->ocnt = tic->t_ocnt;
__entry->cnt = tic->t_cnt;
__entry->curr_res = tic->t_curr_res;
@@ -980,14 +977,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_block = log->l_curr_block;
__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
),
- TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+ TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
"t_unit_res %u t_flags %s reserveq %s "
"writeq %s grant_reserve_cycle %d "
"grant_reserve_bytes %d grant_write_cycle %d "
"grant_write_bytes %d curr_cycle %d curr_block %d "
"tail_cycle %d tail_block %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
__entry->ocnt,
__entry->cnt,
__entry->curr_res,
@@ -1053,19 +1049,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
)
TRACE_EVENT(xfs_log_force,
- TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
- TP_ARGS(mp, lsn),
+ TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
+ TP_ARGS(mp, lsn, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_lsn_t, lsn)
+ __field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->lsn = lsn;
+ __entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d lsn 0x%llx",
+ TP_printk("dev %d:%d lsn 0x%llx caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->lsn)
+ __entry->lsn, (void *)__entry->caller_ip)
)
#define DEFINE_LOG_ITEM_EVENT(name) \
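With the extra caller_ip field, a log force can now be attributed to its call site in the trace output. Callers are expected to pass their own return address; a one-line usage sketch (assumed call site, mirroring other XFS tracepoints that carry a caller_ip):

	trace_xfs_log_force(mp, 0, _RET_IP_);
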
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 20c53666cb4b..5f3d33d16e67 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -47,47 +47,6 @@ xfs_trans_init(
}
/*
- * This routine is called to allocate a transaction structure.
- * The type parameter indicates the type of the transaction. These
- * are enumerated in xfs_trans.h.
- *
- * Dynamically allocate the transaction structure from the transaction
- * zone, initialize it, and return it to the caller.
- */
-xfs_trans_t *
-xfs_trans_alloc(
- xfs_mount_t *mp,
- uint type)
-{
- xfs_trans_t *tp;
-
- sb_start_intwrite(mp->m_super);
- tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
- tp->t_flags |= XFS_TRANS_FREEZE_PROT;
- return tp;
-}
-
-xfs_trans_t *
-_xfs_trans_alloc(
- xfs_mount_t *mp,
- uint type,
- xfs_km_flags_t memflags)
-{
- xfs_trans_t *tp;
-
- WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- atomic_inc(&mp->m_active_trans);
-
- tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
- tp->t_magic = XFS_TRANS_HEADER_MAGIC;
- tp->t_type = type;
- tp->t_mountp = mp;
- INIT_LIST_HEAD(&tp->t_items);
- INIT_LIST_HEAD(&tp->t_busy);
- return tp;
-}
-
-/*
* Free the transaction structure. If there is more clean up
* to do when the structure is freed, add it here.
*/
@@ -99,7 +58,7 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
- if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+ if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
kmem_zone_free(xfs_trans_zone, tp);
@@ -125,7 +84,6 @@ xfs_trans_dup(
* Initialize the new transaction structure.
*/
ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
- ntp->t_type = tp->t_type;
ntp->t_mountp = tp->t_mountp;
INIT_LIST_HEAD(&ntp->t_items);
INIT_LIST_HEAD(&ntp->t_busy);
@@ -135,9 +93,9 @@ xfs_trans_dup(
ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
(tp->t_flags & XFS_TRANS_RESERVE) |
- (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
/* We gave our writer reference to the new transaction */
- tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
tp->t_blk_res = tp->t_blk_res_used;
@@ -165,7 +123,7 @@ xfs_trans_dup(
* This does not do quota reservations. That typically is done by the
* caller afterwards.
*/
-int
+static int
xfs_trans_reserve(
struct xfs_trans *tp,
struct xfs_trans_res *resp,
@@ -219,7 +177,7 @@ xfs_trans_reserve(
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
- permanent, tp->t_type);
+ permanent);
}
if (error)
@@ -268,6 +226,42 @@ undo_blocks:
return error;
}
+int
+xfs_trans_alloc(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resp,
+ uint blocks,
+ uint rtextents,
+ uint flags,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_start_intwrite(mp->m_super);
+
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+ atomic_inc(&mp->m_active_trans);
+
+ tp = kmem_zone_zalloc(xfs_trans_zone,
+ (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+ tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+ tp->t_flags = flags;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+
+ error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ *tpp = tp;
+ return 0;
+}
+
/*
* Record the indicated change to the given field for application
* to the file system's superblock when the transaction commits.
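The replacement xfs_trans_alloc() folds allocation, freeze protection (skipped for XFS_TRANS_NO_WRITECOUNT) and log/space reservation into one call, and cancels the half-built transaction itself when the reservation fails. That is why the converted callers earlier in this patch can simply return or break on error instead of jumping to a cancel label. A minimal caller lifecycle under the new interface (editorial sketch; tr_itruncate and the zero block/rtextent counts are just one combination used above):

	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error)
		return error;		/* no transaction exists to cancel here */

	/* ... xfs_trans_ijoin(), modifications, logging ... */

	return xfs_trans_commit(tp);
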
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e7c49cf43fbc..9a462e892e4f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -90,7 +90,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
*/
typedef struct xfs_trans {
unsigned int t_magic; /* magic number */
- unsigned int t_type; /* transaction type */
unsigned int t_log_res; /* amt of log space resvd */
unsigned int t_log_count; /* count for perm log res */
unsigned int t_blk_res; /* # of blocks resvd */
@@ -148,10 +147,9 @@ typedef struct xfs_trans {
/*
* XFS transaction mechanism exported interfaces.
*/
-xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
- uint, uint);
+int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+ uint blocks, uint rtextents, uint flags,
+ struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index d111f691f313..ea62245fee26 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -74,11 +74,12 @@ xfs_forget_acl(
}
static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *value, size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
{
int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ struct xfs_inode *ip = XFS_I(inode);
int error;
/* Convert Linux syscall to XFS internal ATTR flags */
@@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
if (!error)
- xfs_forget_acl(d_inode(dentry), name, xflags);
+ xfs_forget_acl(inode, name, xflags);
return error;
}
@@ -146,7 +147,7 @@ __xfs_xattr_put_listent(
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
context->count = -1; /* insufficient space */
- return 1;
+ return 0;
}
offset = (char *)context->alist + context->count;
strncpy(offset, prefix, prefix_len);
@@ -166,8 +167,7 @@ xfs_xattr_put_listent(
int flags,
unsigned char *name,
int namelen,
- int valuelen,
- unsigned char *value)
+ int valuelen)
{
char *prefix;
int prefix_len;
@@ -221,11 +221,15 @@ xfs_xattr_put_listent(
}
ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+xfs_vn_listxattr(
+ struct dentry *dentry,
+ char *data,
+ size_t size)
{
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
- struct inode *inode = d_inode(dentry);
+ struct inode *inode = d_inode(dentry);
+ int error;
/*
* First read the regular on-disk attributes.
@@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
context.firstu = context.bufsize;
context.put_listent = xfs_xattr_put_listent;
- xfs_attr_list_int(&context);
+ error = xfs_attr_list_int(&context);
+ if (error)
+ return error;
if (context.count < 0)
return -ERANGE;