summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_file.c8
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/adfs/adfs.h28
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/file.c8
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/attr.c2
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/block_dev.c49
-rw-r--r--fs/btrfs/Makefile5
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/async-thread.c4
-rw-r--r--fs/btrfs/backref.c25
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/check-integrity.c105
-rw-r--r--fs/btrfs/ctree.c11
-rw-r--r--fs/btrfs/ctree.h181
-rw-r--r--fs/btrfs/delayed-inode.c7
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/delayed-ref.h8
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/disk-io.c178
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-tree.c237
-rw-r--r--fs/btrfs/extent-tree.h0
-rw-r--r--fs/btrfs/extent_io.c310
-rw-r--r--fs/btrfs/extent_io.h136
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h10
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c56
-rw-r--r--fs/btrfs/free-space-cache.c28
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/free-space-tree.c1591
-rw-r--r--fs/btrfs/free-space-tree.h72
-rw-r--r--fs/btrfs/inode-map.c17
-rw-r--r--fs/btrfs/inode-map.h1
-rw-r--r--fs/btrfs/inode.c343
-rw-r--r--fs/btrfs/ioctl.c116
-rw-r--r--fs/btrfs/locking.c2
-rw-r--r--fs/btrfs/raid56.c102
-rw-r--r--fs/btrfs/relocation.c16
-rw-r--r--fs/btrfs/scrub.c24
-rw-r--r--fs/btrfs/send.c16
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/super.c113
-rw-r--r--fs/btrfs/tests/btrfs-tests.c58
-rw-r--r--fs/btrfs/tests/btrfs-tests.h10
-rw-r--r--fs/btrfs/tests/extent-io-tests.c149
-rw-r--r--fs/btrfs/tests/free-space-tests.c239
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c571
-rw-r--r--fs/btrfs/tests/inode-tests.c2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c20
-rw-r--r--fs/btrfs/transaction.c50
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-defrag.c27
-rw-r--r--fs/btrfs/volumes.c198
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c6
-rw-r--r--fs/cachefiles/interface.c4
-rw-r--r--fs/cachefiles/namei.c40
-rw-r--r--fs/ceph/cache.c4
-rw-r--r--fs/ceph/caps.c4
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/export.c4
-rw-r--r--fs/ceph/file.c18
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/file.c20
-rw-r--r--fs/coda/coda_linux.h3
-rw-r--r--fs/coda/dir.c4
-rw-r--r--fs/coda/file.c8
-rw-r--r--fs/coda/inode.c6
-rw-r--r--fs/configfs/dir.c204
-rw-r--r--fs/configfs/file.c8
-rw-r--r--fs/configfs/inode.c4
-rw-r--r--fs/coredump.c20
-rw-r--r--fs/dax.c575
-rw-r--r--fs/dcache.c9
-rw-r--r--fs/debugfs/inode.c22
-rw-r--r--fs/devpts/inode.c12
-rw-r--r--fs/direct-io.c8
-rw-r--r--fs/dlm/user.c2
-rw-r--r--fs/ecryptfs/inode.c32
-rw-r--r--fs/ecryptfs/main.c6
-rw-r--r--fs/ecryptfs/mmap.c4
-rw-r--r--fs/efivarfs/file.c4
-rw-r--r--fs/efivarfs/super.c4
-rw-r--r--fs/efs/super.c6
-rw-r--r--fs/eventfd.c4
-rw-r--r--fs/eventpoll.c24
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exofs/file.c4
-rw-r--r--fs/exofs/super.c4
-rw-r--r--fs/exportfs/expfs.c12
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext2/ioctl.c12
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/crypto.c6
-rw-r--r--fs/ext4/crypto_key.c4
-rw-r--r--fs/ext4/ext4.h101
-rw-r--r--fs/ext4/extents.c173
-rw-r--r--fs/ext4/file.c102
-rw-r--r--fs/ext4/ialloc.c7
-rw-r--r--fs/ext4/inline.c10
-rw-r--r--fs/ext4/inode.c278
-rw-r--r--fs/ext4/ioctl.c388
-rw-r--r--fs/ext4/namei.c38
-rw-r--r--fs/ext4/super.c103
-rw-r--r--fs/ext4/truncate.h2
-rw-r--r--fs/f2fs/data.c4
-rw-r--r--fs/f2fs/file.c20
-rw-r--r--fs/f2fs/super.c5
-rw-r--r--fs/fat/cache.c79
-rw-r--r--fs/fat/dir.c6
-rw-r--r--fs/fat/fat.h8
-rw-r--r--fs/fat/fatent.c24
-rw-r--r--fs/fat/file.c69
-rw-r--r--fs/fat/inode.c106
-rw-r--r--fs/file.c7
-rw-r--r--fs/filesystems.c6
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/dir.c10
-rw-r--r--fs/fuse/file.c101
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/gfs2/file.c4
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/main.c3
-rw-r--r--fs/gfs2/quota.c8
-rw-r--r--fs/hfs/catalog.c6
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/inode.c8
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/dir.c4
-rw-r--r--fs/hfsplus/inode.c8
-rw-r--r--fs/hfsplus/ioctl.c4
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c6
-rw-r--r--fs/hpfs/dir.c6
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c186
-rw-r--r--fs/inode.c12
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jffs2/build.c8
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/fs.c5
-rw-r--r--fs/jffs2/super.c7
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/ioctl.c6
-rw-r--r--fs/jfs/super.c8
-rw-r--r--fs/kernfs/dir.c13
-rw-r--r--fs/libfs.c10
-rw-r--r--fs/lockd/svc.c79
-rw-r--r--fs/locks.c6
-rw-r--r--fs/logfs/Kconfig2
-rw-r--r--fs/logfs/file.c8
-rw-r--r--fs/logfs/inode.c3
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/namei.c74
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/ncpfs/dir.c8
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/callback_proc.c52
-rw-r--r--fs/nfs/dir.c34
-rw-r--r--fs/nfs/direct.c60
-rw-r--r--fs/nfs/file.c10
-rw-r--r--fs/nfs/filelayout/filelayout.c20
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c211
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c115
-rw-r--r--fs/nfs/inode.c88
-rw-r--r--fs/nfs/internal.h41
-rw-r--r--fs/nfs/nfs42proc.c37
-rw-r--r--fs/nfs/nfs4file.c24
-rw-r--r--fs/nfs/nfs4proc.c71
-rw-r--r--fs/nfs/nfs4sysctl.c2
-rw-r--r--fs/nfs/nfs4trace.c1
-rw-r--r--fs/nfs/nfs4trace.h431
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/pagelist.c126
-rw-r--r--fs/nfs/pnfs.c180
-rw-r--r--fs/nfs/pnfs.h54
-rw-r--r--fs/nfs/pnfs_nfs.c10
-rw-r--r--fs/nfs/read.c43
-rw-r--r--fs/nfs/write.c140
-rw-r--r--fs/nfsd/lockd.c2
-rw-r--r--fs/nfsd/netns.h2
-rw-r--r--fs/nfsd/nfs4callback.c6
-rw-r--r--fs/nfsd/nfs4layouts.c39
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4recover.c18
-rw-r--r--fs/nfsd/nfs4state.c65
-rw-r--r--fs/nfsd/nfsfh.h27
-rw-r--r--fs/nfsd/nfssvc.c75
-rw-r--r--fs/nfsd/state.h4
-rw-r--r--fs/nfsd/trace.h41
-rw-r--r--fs/nfsd/vfs.c19
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/super.c3
-rw-r--r--fs/notify/inode_mark.c3
-rw-r--r--fs/notify/mark.c66
-rw-r--r--fs/ntfs/dir.c4
-rw-r--r--fs/ntfs/file.c8
-rw-r--r--fs/ntfs/quota.c6
-rw-r--r--fs/ntfs/super.c16
-rw-r--r--fs/ocfs2/alloc.c47
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h11
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c35
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c15
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/file.c20
-rw-r--r--fs/ocfs2/inode.c12
-rw-r--r--fs/ocfs2/ioctl.c16
-rw-r--r--fs/ocfs2/journal.c18
-rw-r--r--fs/ocfs2/localalloc.c26
-rw-r--r--fs/ocfs2/move_extents.c16
-rw-r--r--fs/ocfs2/namei.c49
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c6
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/resize.c8
-rw-r--r--fs/ocfs2/slot_map.c14
-rw-r--r--fs/ocfs2/suballoc.c12
-rw-r--r--fs/ocfs2/super.c13
-rw-r--r--fs/ocfs2/xattr.c14
-rw-r--r--fs/open.c12
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/overlayfs/copy_up.c45
-rw-r--r--fs/overlayfs/dir.c12
-rw-r--r--fs/overlayfs/inode.c40
-rw-r--r--fs/overlayfs/overlayfs.h3
-rw-r--r--fs/overlayfs/readdir.c23
-rw-r--r--fs/overlayfs/super.c50
-rw-r--r--fs/pipe.c47
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c34
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/meminfo.c5
-rw-r--r--fs/proc/namespaces.c4
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/self.c4
-rw-r--r--fs/proc/task_mmu.c137
-rw-r--r--fs/proc/thread_self.c4
-rw-r--r--fs/pstore/inode.c6
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c22
-rw-r--r--fs/quota/netlink.c5
-rw-r--r--fs/quota/quota_v2.c4
-rw-r--r--fs/read_write.c7
-rw-r--r--fs/readdir.c2
-rw-r--r--fs/reiserfs/dir.c4
-rw-r--r--fs/reiserfs/file.c4
-rw-r--r--fs/reiserfs/ioctl.c2
-rw-r--r--fs/reiserfs/super.c5
-rw-r--r--fs/reiserfs/xattr.c64
-rw-r--r--fs/romfs/super.c4
-rw-r--r--fs/squashfs/super.c3
-rw-r--r--fs/stat.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/tracefs/inode.c34
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c4
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c98
-rw-r--r--fs/udf/file.c10
-rw-r--r--fs/udf/inode.c245
-rw-r--r--fs/udf/super.c24
-rw-r--r--fs/udf/udfdecl.h4
-rw-r--r--fs/udf/unicode.c21
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/utimes.c4
-rw-r--r--fs/xattr.c8
-rw-r--r--fs/xfs/kmem.h1
-rw-r--r--fs/xfs/libxfs/xfs_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_fs.h38
-rw-r--r--fs/xfs/xfs_buf.c10
-rw-r--r--fs/xfs/xfs_file.c13
-rw-r--r--fs/xfs/xfs_inode.c60
-rw-r--r--fs/xfs/xfs_ioctl.c92
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_pnfs.c4
-rw-r--r--fs/xfs/xfs_super.c4
-rw-r--r--fs/xfs/xfs_trans_ail.c1
306 files changed, 8864 insertions, 3520 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca025019d..072e7599583a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
sizeof(struct v9fs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
v9fs_inode_init_once);
if (!v9fs_inode_cache)
return -ENOMEM;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 7bf835f85bc8..eadc894faea2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
if (retval)
return retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
v9fs_blank_wstat(&wstat);
retval = p9_client_wstat(fid, &wstat);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
@@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
if (retval)
return retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
retval = p9_client_fsync(fid, datasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index 2bb1ef86c411..9adee0d7536e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,7 +50,8 @@ config FS_DAX_PMD
bool
default FS_DAX
depends on FS_DAX
- depends on BROKEN
+ depends on ZONE_DEVICE
+ depends on TRANSPARENT_HUGEPAGE
endif # BLOCK
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index ea4aba56f29d..fadf408bdd46 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -44,24 +44,24 @@ struct adfs_dir_ops;
*/
struct adfs_sb_info {
union { struct {
- struct adfs_discmap *s_map; /* bh list containing map */
- const struct adfs_dir_ops *s_dir; /* directory operations */
+ struct adfs_discmap *s_map; /* bh list containing map */
+ const struct adfs_dir_ops *s_dir; /* directory operations */
};
- struct rcu_head rcu; /* used only at shutdown time */
+ struct rcu_head rcu; /* used only at shutdown time */
};
- kuid_t s_uid; /* owner uid */
- kgid_t s_gid; /* owner gid */
- umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
- umode_t s_other_mask; /* ADFS other perm -> unix perm */
+ kuid_t s_uid; /* owner uid */
+ kgid_t s_gid; /* owner gid */
+ umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
+ umode_t s_other_mask; /* ADFS other perm -> unix perm */
int s_ftsuffix; /* ,xyz hex filetype suffix option */
- __u32 s_ids_per_zone; /* max. no ids in one zone */
- __u32 s_idlen; /* length of ID in map */
- __u32 s_map_size; /* sector size of a map */
- unsigned long s_size; /* total size (in blocks) of this fs */
- signed int s_map2blk; /* shift left by this for map->sector */
- unsigned int s_log2sharesize;/* log2 share size */
- __le32 s_version; /* disc format version */
+ __u32 s_ids_per_zone; /* max. no ids in one zone */
+ __u32 s_idlen; /* length of ID in map */
+ __u32 s_map_size; /* sector size of a map */
+ unsigned long s_size; /* total size (in blocks) of this fs */
+ signed int s_map2blk; /* shift left by this for map->sector*/
+ unsigned int s_log2sharesize;/* log2 share size */
+ __le32 s_version; /* disc format version */
unsigned int s_namelen; /* maximum number of characters in name */
};
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df8344f..c9fdfb112933 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (adfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 659c579c4588..0548c53f41d5 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp)
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (inode->i_size != AFFS_I(inode)->mmu_private)
affs_truncate(inode);
affs_free_prealloc(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
@@ -958,12 +958,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = write_inode_now(inode, 0);
err = sync_blockdev(inode->i_sb->s_bdev);
if (!ret)
ret = err;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
const struct file_operations affs_file_operations = {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8836df5f1e11..2a6713b6b9f4 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -132,7 +132,7 @@ static int __init init_inodecache(void)
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (affs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 4baf1d2b39e4..d91a9c9cfbd0 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
fl->fl_type = F_UNLCK;
- mutex_lock(&vnode->vfs_inode.i_mutex);
+ inode_lock(&vnode->vfs_inode);
/* check local lock records first */
ret = 0;
@@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
}
error:
- mutex_unlock(&vnode->vfs_inode.i_mutex);
+ inode_unlock(&vnode->vfs_inode);
_leave(" = %d [%hd]", ret, fl->fl_type);
return ret;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a5129f7d..81afefe7d8a6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
afs_inode_cachep = kmem_cache_create("afs_inode_cache",
sizeof(struct afs_vnode),
0,
- SLAB_HWCACHE_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
afs_i_init_once);
if (!afs_inode_cachep) {
printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 0714abcd7f32..dfef94f70667 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* use a writeback record as a marker in the queue - when this reaches
* the front of the queue, all the outstanding writes are either
@@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
afs_put_writeback(wb);
_leave(" = %d", ret);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/attr.c b/fs/attr.c
index 6530ced19697..25b24d0f6c88 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(inode));
if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 25250fa87086..cc0e08252913 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
sizeof (struct befs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (befs_inode_cachep == NULL) {
pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d69f430..1e5c896f6b79 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (bfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 78f005f37847..3a3ced779fc7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3:
/* Delete this handler. */
root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
kill_node(e);
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(root);
break;
default:
@@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
root = dget(sb->s_root);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
out2:
dput(dentry);
out:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(root);
if (err) {
@@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
case 3:
/* Delete all handlers. */
root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
while (!list_empty(&entries))
kill_node(list_entry(entries.next, Node, list));
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(root);
break;
default:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d878e4860fb7..7b9cd49622b1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
return;
invalidate_bh_lrus();
@@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
struct inode *bd_inode = bdev_file_inode(file);
loff_t retval;
- mutex_lock(&bd_inode->i_mutex);
+ inode_lock(bd_inode);
retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
- mutex_unlock(&bd_inode->i_mutex);
+ inode_unlock(bd_inode);
return retval;
}
@@ -400,7 +400,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+ result = blk_queue_enter(bdev->bd_queue, false);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
@@ -437,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+ result = blk_queue_enter(bdev->bd_queue, false);
if (result)
return result;
@@ -455,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
/**
* bdev_direct_access() - Get the address for directly-accessibly memory
* @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
*
* If a block device is made up of directly addressable memory, this function
* will tell the caller the PFN and the address of the memory. The address
@@ -469,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* Return: negative errno if an error occurs, otherwise the number of bytes
* accessible at this address.
*/
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
{
- long avail;
+ sector_t sector = dax->sector;
+ long avail, size = dax->size;
const struct block_device_operations *ops = bdev->bd_disk->fops;
/*
@@ -491,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
if (!avail)
return -ERANGE;
+ if (avail > 0 && avail & ~PAGE_MASK)
+ return -ENXIO;
return min(avail, size);
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
@@ -595,7 +594,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
@@ -701,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode)
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
spin_unlock(&bdev_lock);
return bdev;
}
@@ -717,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode)
* So, we can access it via ->i_mapping always
* without igrab().
*/
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -740,7 +739,7 @@ void bd_forget(struct inode *inode)
spin_unlock(&bdev_lock);
if (bdev)
- iput(bdev->bd_inode);
+ bdput(bdev);
}
/**
@@ -1143,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_logical_block_size(bdev);
- mutex_lock(&bdev->bd_inode->i_mutex);
+ inode_lock(bdev->bd_inode);
i_size_write(bdev->bd_inode, size);
- mutex_unlock(&bdev->bd_inode->i_mutex);
+ inode_unlock(bdev->bd_inode);
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
@@ -1742,9 +1741,9 @@ static void blkdev_vm_open(struct vm_area_struct *vma)
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
struct block_device *bdev = I_BDEV(bd_inode);
- mutex_lock(&bd_inode->i_mutex);
+ inode_lock(bd_inode);
bdev->bd_map_count++;
- mutex_unlock(&bd_inode->i_mutex);
+ inode_unlock(bd_inode);
}
static void blkdev_vm_close(struct vm_area_struct *vma)
@@ -1752,9 +1751,9 @@ static void blkdev_vm_close(struct vm_area_struct *vma)
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
struct block_device *bdev = I_BDEV(bd_inode);
- mutex_lock(&bd_inode->i_mutex);
+ inode_lock(bd_inode);
bdev->bd_map_count--;
- mutex_unlock(&bd_inode->i_mutex);
+ inode_unlock(bd_inode);
}
static const struct vm_operations_struct blkdev_dax_vm_ops = {
@@ -1778,7 +1777,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
struct block_device *bdev = I_BDEV(bd_inode);
file_accessed(file);
- mutex_lock(&bd_inode->i_mutex);
+ inode_lock(bd_inode);
bdev->bd_map_count++;
if (IS_DAX(bd_inode)) {
vma->vm_ops = &blkdev_dax_vm_ops;
@@ -1786,7 +1785,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
} else {
vma->vm_ops = &blkdev_default_vm_ops;
}
- mutex_unlock(&bd_inode->i_mutex);
+ inode_unlock(bd_inode);
return 0;
}
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b93b1aa..128ce17a80b0 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o
+ uuid-tree.o props.o hash.o free-space-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
- tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
+ tests/free-space-tree-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f89db0c21b51..6d263bb1621c 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
+ value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
@@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = kmalloc(size, GFP_KERNEL);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 3e36e4adc4a3..88d9af3d4581 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -97,7 +97,7 @@ static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
int limit_active,
int thresh)
{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d453d62ab0c6..b90cd3776f8e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -520,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1,
static int __add_missing_keys(struct btrfs_fs_info *fs_info,
struct list_head *head)
{
- struct list_head *pos;
+ struct __prelim_ref *ref;
struct extent_buffer *eb;
- list_for_each(pos, head) {
- struct __prelim_ref *ref;
- ref = list_entry(pos, struct __prelim_ref, list);
-
+ list_for_each_entry(ref, head, list) {
if (ref->parent)
continue;
if (ref->key_for_search.type)
@@ -563,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static void __merge_refs(struct list_head *head, int mode)
{
- struct list_head *pos1;
+ struct __prelim_ref *pos1;
- list_for_each(pos1, head) {
- struct list_head *n2;
- struct list_head *pos2;
- struct __prelim_ref *ref1;
+ list_for_each_entry(pos1, head, list) {
+ struct __prelim_ref *pos2 = pos1, *tmp;
- ref1 = list_entry(pos1, struct __prelim_ref, list);
-
- for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
- pos2 = n2, n2 = pos2->next) {
- struct __prelim_ref *ref2;
- struct __prelim_ref *xchg;
+ list_for_each_entry_safe_continue(pos2, tmp, head, list) {
+ struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2;
struct extent_inode_elem *eie;
- ref2 = list_entry(pos2, struct __prelim_ref, list);
-
if (!ref_for_same_block(ref1, ref2))
continue;
if (mode == 1) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13fae2..61205e3bbefa 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -192,6 +192,10 @@ struct btrfs_inode {
/* File creation time. */
struct timespec i_otime;
+ /* Hook into fs_info->delayed_iputs */
+ struct list_head delayed_iput;
+ long delayed_iput_count;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0340c57bf377..861d472564c1 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup(
(((unsigned int)(dev_bytenr >> 16)) ^
((unsigned int)((uintptr_t)bdev))) &
(BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block *const b =
- list_entry(elem, struct btrfsic_block,
- collision_resolving_node);
+ struct btrfsic_block *b;
+ list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
return b;
}
@@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
((unsigned int)((uintptr_t)bdev_ref_to)) ^
((unsigned int)((uintptr_t)bdev_ref_from))) &
(BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block_link *const l =
- list_entry(elem, struct btrfsic_block_link,
- collision_resolving_node);
+ struct btrfsic_block_link *l;
+ list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
BUG_ON(NULL == l->block_ref_to);
BUG_ON(NULL == l->block_ref_from);
if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
@@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
const unsigned int hashval =
(((unsigned int)((uintptr_t)bdev)) &
(BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_dev_state *const ds =
- list_entry(elem, struct btrfsic_dev_state,
- collision_resolving_node);
+ struct btrfsic_dev_state *ds;
+ list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
if (ds->bdev == bdev)
return ds;
}
@@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state,
static void btrfsic_dump_database(struct btrfsic_state *state)
{
- struct list_head *elem_all;
+ const struct btrfsic_block *b_all;
BUG_ON(NULL == state);
printk(KERN_INFO "all_blocks_list:\n");
- list_for_each(elem_all, &state->all_blocks_list) {
- const struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *elem_ref_from;
+ list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
+ const struct btrfsic_block_link *l;
printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
b_all->logical_bytenr, b_all->dev_state->name,
b_all->dev_bytenr, b_all->mirror_num);
- list_for_each(elem_ref_to, &b_all->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" refers %u* to"
" %c @%llu (%s/%llu/%d)\n",
@@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
l->block_ref_to->mirror_num);
}
- list_for_each(elem_ref_from, &b_all->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from,
- struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" is ref %u* from"
" %c @%llu (%s/%llu/%d)\n",
@@ -1845,8 +1819,7 @@ again:
&state->block_hashtable);
if (NULL != block) {
u64 bytenr = 0;
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
+ struct btrfsic_block_link *l, *tmp;
if (block->is_superblock) {
bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
@@ -1967,13 +1940,8 @@ again:
* because it still carries valueable information
* like whether it was ever written and IO completed.
*/
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &block->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry_safe(l, tmp, &block->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
l->ref_cnt--;
@@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
struct btrfsic_block *const block,
int recursion_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int ret = 0;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
@@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack
* space is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock(
const struct btrfsic_block *block,
int recursion_level)
{
- struct list_head *elem_ref_from;
+ const struct btrfsic_block_link *l;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
/* refer to comment at "abort cyclic linkage (case 1)" */
@@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock(
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_from, &block->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from, struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int indent_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int indent_add;
static char buf[80];
int cursor_position;
@@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
}
cursor_position = indent_level;
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
while (cursor_position < indent_level) {
printk(" ");
cursor_position++;
@@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root,
void btrfsic_unmount(struct btrfs_root *root,
struct btrfs_fs_devices *fs_devices)
{
- struct list_head *elem_all;
- struct list_head *tmp_all;
+ struct btrfsic_block *b_all, *tmp_all;
struct btrfsic_state *state;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root,
* just free all memory that was allocated dynamically.
* Free the blocks and the block_links.
*/
- list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
- struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
-
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &b_all->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
+ list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
+ all_blocks_node) {
+ struct btrfsic_block_link *l, *tmp;
+ list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5b8e235c4b6d..769e0ff1b4ce 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+ search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
btrfs_set_lock_blocking(parent);
@@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root,
u64 target;
u64 nread = 0;
u64 gen;
- int direction = path->reada;
struct extent_buffer *eb;
u32 nr;
u32 blocksize;
@@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root,
nr = slot;
while (1) {
- if (direction < 0) {
+ if (path->reada == READA_BACK) {
if (nr == 0)
break;
nr--;
- } else if (direction > 0) {
+ } else if (path->reada == READA_FORWARD) {
nr++;
if (nr >= nritems)
break;
}
- if (path->reada < 0 && objectid) {
+ if (path->reada == READA_BACK && objectid) {
btrfs_node_key(node, &disk_key, nr);
if (btrfs_disk_key_objectid(&disk_key) != objectid)
break;
@@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
free_extent_buffer(tmp);
- if (p->reada)
+ if (p->reada != READA_NONE)
reada_for_search(root, p, level, slot, key->objectid);
btrfs_release_path(p);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7e4e344e8e0..bfe4a337fb4d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+#include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -96,6 +97,9 @@ struct btrfs_ordered_sum;
/* for storing items that use the BTRFS_UUID_KEY* types */
#define BTRFS_UUID_TREE_OBJECTID 9ULL
+/* tracks free space in block groups. */
+#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
@@ -174,7 +178,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4 };
+static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 };
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
-#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
-#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
* The key defines the order in the tree, and so it also defines (optimal)
@@ -500,6 +504,8 @@ struct btrfs_super_block {
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
+#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
+
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
@@ -526,7 +532,10 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -590,14 +599,15 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
+enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
/* if there is real range locking, this locks field will change */
- int locks[BTRFS_MAX_LEVEL];
- int reada;
+ u8 locks[BTRFS_MAX_LEVEL];
+ u8 reada;
/* keep some upper locks as we walk down */
- int lowest_level;
+ u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
@@ -1088,6 +1098,13 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+struct btrfs_free_space_info {
+ __le32 extent_count;
+ __le32 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline u64 btrfs_qgroup_level(u64 qgroupid)
{
@@ -1296,6 +1313,9 @@ struct btrfs_caching_control {
atomic_t count;
};
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
+
struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
@@ -1321,8 +1341,20 @@ struct btrfs_block_group_cache {
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
- u64 sectorsize;
u64 cache_generation;
+ u32 sectorsize;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
/*
* It is just used for the delayed data space allocation because
@@ -1378,6 +1410,15 @@ struct btrfs_block_group_cache {
struct list_head io_list;
struct btrfs_io_ctl io_ctl;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Does the block group need to be added to the free space tree?
+ * Protected by free_space_lock.
+ */
+ int needs_free_space;
};
/* delayed seq elem */
@@ -1429,6 +1470,7 @@ struct btrfs_fs_info {
struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
+ struct btrfs_root *free_space_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1572,7 +1614,7 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct rw_semaphore delayed_iput_sem;
+ struct mutex cleaner_delayed_iput_mutex;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1816,6 +1858,8 @@ struct btrfs_fs_info {
* and will be latter freed. Protected by fs_info->chunk_mutex.
*/
struct list_head pinned_chunks;
+
+ int creating_free_space_tree;
};
struct btrfs_subvolume_writers {
@@ -2092,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+/*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+#define BTRFS_FREE_SPACE_INFO_KEY 198
+
+/*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+#define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+/*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
@@ -2184,6 +2249,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
+#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2506,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags,
BTRFS_SETGET_STACK_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
/* struct btrfs_inode_ref */
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@ -3570,9 +3641,13 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const u64 type);
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3737,6 +3812,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->csum_root);
kfree(fs_info->quota_root);
kfree(fs_info->uuid_root);
+ kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
security_free_mnt_opts(&fs_info->security_opts);
@@ -3906,7 +3982,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
- int wait;
int delay_iput;
struct completion completion;
struct list_head list;
@@ -3914,7 +3989,7 @@ struct btrfs_delalloc_work {
};
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput);
+ int delay_iput);
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
@@ -4253,16 +4328,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
}
}
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
#define btrfs_fs_incompat(fs_info, opt) \
__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
-static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
{
struct btrfs_super_block *disk_super;
disk_super = fs_info->super_copy;
return !!(btrfs_super_incompat_flags(disk_super) & flag);
}
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+}
+
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e0941fbb913c..0be47e4b8136 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
atomic_set(&delayed_node->refs, 0);
- delayed_node->count = 0;
- delayed_node->flags = 0;
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
- delayed_node->index_cnt = 0;
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
- delayed_node->bytes_reserved = 0;
- memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}
static inline int btrfs_is_continuous_delayed_item(
@@ -132,7 +127,7 @@ again:
if (node)
return node;
- node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e06dd75ad13f..914ac13bd92f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
memcpy(&existing_ref->extent_op->key,
&ref->extent_op->key,
sizeof(ref->extent_op->key));
- existing_ref->extent_op->update_key = 1;
+ existing_ref->extent_op->update_key = true;
}
if (ref->extent_op->update_flags) {
existing_ref->extent_op->flags_to_set |=
ref->extent_op->flags_to_set;
- existing_ref->extent_op->update_flags = 1;
+ existing_ref->extent_op->update_flags = true;
}
btrfs_free_delayed_extent_op(ref->extent_op);
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 00ed02cbf3e9..c24b653c7343 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
+ u8 level;
+ bool update_key;
+ bool update_flags;
+ bool is_data;
u64 flags_to_set;
- int level;
- unsigned int update_key:1;
- unsigned int update_flags:1;
- unsigned int is_data:1;
};
/*
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1e668fb7dd4c..cbb7dbfb3fff 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
em = lookup_extent_mapping(em_tree, start, (u64)-1);
if (!em)
break;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 42a378a4eefb..dd08e29f5117 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -54,6 +55,12 @@
#include <asm/cpufeature.h>
#endif
+#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
+ BTRFS_HEADER_FLAG_RELOC |\
+ BTRFS_SUPER_FLAG_ERROR |\
+ BTRFS_SUPER_FLAG_SEEDING |\
+ BTRFS_SUPER_FLAG_METADUMP)
+
static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
@@ -362,7 +369,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state);
+ &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
@@ -1582,8 +1589,23 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto free_writers;
+
+ mutex_lock(&root->objectid_mutex);
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
+ if (ret) {
+ mutex_unlock(&root->objectid_mutex);
+ goto free_root_dev;
+ }
+
+ ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&root->objectid_mutex);
+
return 0;
+free_root_dev:
+ free_anon_bdev(root->anon_dev);
free_writers:
btrfs_free_subvolume_writers(root->subv_writers);
fail:
@@ -1650,6 +1672,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
return fs_info->uuid_root ? fs_info->uuid_root :
ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return fs_info->free_space_root ? fs_info->free_space_root :
+ ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
@@ -1782,7 +1807,10 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -2148,6 +2176,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->uuid_root);
if (chunk_root)
free_root_extent_buffers(info->chunk_root);
+ free_root_extent_buffers(info->free_space_root);
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2448,6 +2477,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
fs_info->uuid_root = root;
}
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
+ }
+
return 0;
}
@@ -2542,8 +2580,8 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
- init_rwsem(&fs_info->delayed_iput_sem);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
@@ -2668,6 +2706,7 @@ int open_ctree(struct super_block *sb,
if (btrfs_check_super_csum(bh->b_data)) {
printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
+ brelse(bh);
goto fail_alloc;
}
@@ -2727,26 +2766,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- /*
- * Leafsize and nodesize were always equal, this is only a sanity check.
- */
- if (le32_to_cpu(disk_super->__unused_leafsize) !=
- btrfs_super_nodesize(disk_super)) {
- printk(KERN_ERR "BTRFS: couldn't mount because metadata "
- "blocksizes don't match. node %d leaf %d\n",
- btrfs_super_nodesize(disk_super),
- le32_to_cpu(disk_super->__unused_leafsize));
- err = -EINVAL;
- goto fail_alloc;
- }
- if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
- printk(KERN_ERR "BTRFS: couldn't mount because metadata "
- "blocksize (%d) was too large\n",
- btrfs_super_nodesize(disk_super));
- err = -EINVAL;
- goto fail_alloc;
- }
-
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
@@ -2809,7 +2828,7 @@ int open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+ SZ_4M / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
tree_root->sectorsize = sectorsize;
@@ -2818,17 +2837,6 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
- goto fail_sb_buffer;
- }
-
- if (sectorsize != PAGE_SIZE) {
- printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
- "found on %s\n", (unsigned long)sectorsize, sb->s_id);
- goto fail_sb_buffer;
- }
-
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2900,6 +2908,18 @@ retry_root_backup:
tree_root->commit_root = btrfs_root_node(tree_root);
btrfs_set_root_refs(&tree_root->root_item, 1);
+ mutex_lock(&tree_root->objectid_mutex);
+ ret = btrfs_find_highest_objectid(tree_root,
+ &tree_root->highest_objectid);
+ if (ret) {
+ mutex_unlock(&tree_root->objectid_mutex);
+ goto recovery_tree_root;
+ }
+
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&tree_root->objectid_mutex);
+
ret = btrfs_read_roots(fs_info, tree_root);
if (ret)
goto recovery_tree_root;
@@ -3051,6 +3071,18 @@ retry_root_backup:
if (sb->s_flags & MS_RDONLY)
return 0;
+ if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: creating free space tree\n");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
@@ -3076,6 +3108,18 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
+ if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: clearing free space tree\n");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to clear free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
@@ -3902,11 +3946,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
return !ret;
}
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
-{
- return set_extent_buffer_uptodate(buf);
-}
-
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_root *root;
@@ -3962,7 +4001,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
- return;
}
void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -3985,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 nodesize = btrfs_super_nodesize(sb);
+ u64 sectorsize = btrfs_super_sectorsize(sb);
int ret = 0;
+ if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+ printk(KERN_ERR "BTRFS: no valid FS found\n");
+ ret = -EINVAL;
+ }
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
+ printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
@@ -4004,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
}
/*
- * The common minimum, we don't know if we can trust the nodesize/sectorsize
- * items yet, they'll be verified later. Issue just a warning.
+ * Check sectorsize and nodesize first, other check will need it.
+ * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
- if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+ if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize);
+ ret = -EINVAL;
+ }
+ /* Only PAGE SIZE is supported yet */
+ if (sectorsize != PAGE_CACHE_SIZE) {
+ printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
+ sectorsize, PAGE_CACHE_SIZE);
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+ nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize);
+ ret = -EINVAL;
+ }
+ if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+ printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n",
+ le32_to_cpu(sb->__unused_leafsize),
+ nodesize);
+ ret = -EINVAL;
+ }
+
+ /* Root alignment check */
+ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
btrfs_super_root(sb));
- if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
btrfs_super_chunk_root(sb));
- if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
- printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
- btrfs_super_log_root(sb));
-
- /*
- * Check the lower bound, the alignment and other constraints are
- * checked later.
- */
- if (btrfs_super_nodesize(sb) < 4096) {
- printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
- btrfs_super_nodesize(sb));
ret = -EINVAL;
}
- if (btrfs_super_sectorsize(sb) < 4096) {
- printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
- btrfs_super_sectorsize(sb));
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+ printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+ btrfs_super_log_root(sb));
ret = -EINVAL;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index adeb31830b9c..8e79d0070bcf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,7 +19,7 @@
#ifndef __DISKIO__
#define __DISKIO__
-#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
#define BTRFS_SUPER_MIRROR_MAX 3
@@ -35,7 +35,7 @@ enum btrfs_wq_endio_type {
static inline u64 btrfs_sb_offset(int mirror)
{
- u64 start = 16 * 1024;
+ u64 start = SZ_16K;
if (mirror)
return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
return BTRFS_SUPER_INFO_OFFSET;
@@ -116,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c4661db2b72a..e2287c7c10be 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
@@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root,
* we need to check the pinned_extents for any extents that can't be used yet
* since their free space will be released as soon as the transaction commits.
*/
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
{
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
return total_added;
}
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
- struct btrfs_caching_control *caching_ctl;
struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 total_found = 0;
u64 last = 0;
u32 nritems;
- int ret = -ENOMEM;
+ int ret;
bool wakeup = true;
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
extent_root = fs_info->extent_root;
path = btrfs_alloc_path();
if (!path)
- goto out;
+ return -ENOMEM;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -438,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
- mutex_lock(&caching_ctl->mutex);
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto err;
+ goto out;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -477,12 +472,14 @@ next:
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
}
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
- goto err;
+ goto out;
if (ret)
break;
leaf = path->nodes[0];
@@ -521,7 +518,7 @@ next:
else
last = key.objectid + key.offset;
- if (total_found > (1024 * 1024 * 2)) {
+ if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
if (wakeup)
wake_up(&caching_ctl->wait);
@@ -534,9 +531,37 @@ next:
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_FINISHED;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
#ifdef CONFIG_BTRFS_DEBUG
@@ -555,20 +580,11 @@ next:
#endif
caching_ctl->progress = (u64)-1;
-err:
- btrfs_free_path(path);
- up_read(&fs_info->commit_root_sem);
-
- free_excluded_extents(extent_root, block_group);
+ up_read(&fs_info->commit_root_sem);
+ free_excluded_extents(fs_info->extent_root, block_group);
mutex_unlock(&caching_ctl->mutex);
-out:
- if (ret) {
- spin_lock(&block_group->lock);
- block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_ERROR;
- spin_unlock(&block_group->lock);
- }
+
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
@@ -680,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
} else {
/*
- * We are not going to do the fast caching, set cached to the
- * appropriate value and wakeup any waiters.
+ * We're either using the free space tree or no caching at all.
+ * Set cached to the appropriate value and wakeup any waiters.
*/
spin_lock(&cache->lock);
if (load_cache_only) {
@@ -2115,7 +2131,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
@@ -2141,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
@@ -2254,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
path, 0, 1);
@@ -2910,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (trans->aborted)
return 0;
+ if (root->fs_info->creating_free_space_tree)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
@@ -2988,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return -ENOMEM;
extent_op->flags_to_set = flags;
- extent_op->update_flags = 1;
- extent_op->update_key = 0;
- extent_op->is_data = is_data ? 1 : 0;
+ extent_op->update_flags = true;
+ extent_op->update_key = false;
+ extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
@@ -3328,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * 1024 * 1024)) {
+ if (block_group->key.offset < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -3428,7 +3447,7 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, SZ_256M);
if (!num_pages)
num_pages = 1;
@@ -3684,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
/*
- * We don't need the lock here since we are protected by the transaction
- * commit. We want to do the cache_save_setup first and then run the
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
+ spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
@@ -3700,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
/*
@@ -3712,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* on any pending IO
*/
list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
@@ -3736,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+ * a very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = write_one_cache_group(trans, root, path,
+ cache);
+ }
if (ret)
btrfs_abort_transaction(trans, root, ret);
}
@@ -3743,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
+ spin_unlock(&cur_trans->dirty_bgs_lock);
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -4086,8 +4139,10 @@ commit_trans:
!atomic_read(&root->fs_info->open_ioctl_trans)) {
need_commit--;
- if (need_commit > 0)
+ if (need_commit > 0) {
+ btrfs_start_delalloc_roots(fs_info, 0, -1);
btrfs_wait_ordered_roots(fs_info, -1);
+ }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -4100,11 +4155,12 @@ commit_trans:
if (ret)
return ret;
/*
- * make sure that all running delayed iput are
- * done
+ * The cleaner kthread might still be doing iput
+ * operations. Wait for it to finish so that
+ * more space is released.
*/
- down_write(&root->fs_info->delayed_iput_sem);
- up_write(&root->fs_info->delayed_iput_sem);
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
goto again;
} else {
btrfs_end_transaction(trans, root);
@@ -4239,14 +4295,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- thresh = max_t(u64, 64 * 1024 * 1024,
- div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (num_bytes - num_allocated < thresh)
return 1;
}
- if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -4446,7 +4501,7 @@ out:
* transaction.
*/
if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ trans->chunk_bytes_reserved >= (u64)SZ_2M) {
btrfs_create_pending_block_groups(trans, trans->root);
btrfs_trans_release_chunk_metadata(trans);
}
@@ -4544,7 +4599,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
return nr;
}
-#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
* shrink metadata reservation for delalloc
@@ -4749,8 +4804,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim;
- to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
- 16 * 1024 * 1024);
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL)) {
@@ -4761,8 +4815,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
- if (can_overcommit(root, space_info, 1024 * 1024,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -5318,7 +5371,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
- block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -6222,11 +6275,11 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
return ret;
if (ssd)
- *empty_cluster = 2 * 1024 * 1024;
+ *empty_cluster = SZ_2M;
if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
ret = &root->fs_info->meta_alloc_cluster;
if (!ssd)
- *empty_cluster = 64 * 1024;
+ *empty_cluster = SZ_64K;
} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
ret = &root->fs_info->data_alloc_cluster;
}
@@ -6438,7 +6491,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -6661,6 +6714,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+ num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
@@ -7672,6 +7732,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ins->offset);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7752,6 +7817,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ num_bytes);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7834,7 +7904,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
- btrfs_set_buffer_uptodate(buf);
+ set_extent_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
@@ -7980,12 +8050,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- if (skinny_metadata)
- extent_op->update_key = 0;
- else
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
+ extent_op->update_key = skinny_metadata ? false : true;
+ extent_op->update_flags = true;
+ extent_op->is_data = false;
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
@@ -9124,7 +9191,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
if ((sinfo->flags &
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
!force)
- min_allocable_bytes = 1 * 1024 * 1024;
+ min_allocable_bytes = SZ_1M;
else
min_allocable_bytes = 0;
@@ -9656,6 +9723,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
cache->full_stripe_len = btrfs_full_stripe_len(root,
&root->fs_info->mapping_tree,
start);
+ set_free_space_tree_thresholds(cache);
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -9667,6 +9736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
+ mutex_init(&cache->free_space_lock);
return cache;
}
@@ -9691,7 +9761,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (btrfs_test_opt(root, SPACE_CACHE) &&
@@ -9877,6 +9947,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ add_block_group_free_space(trans, root->fs_info, block_group);
+ /* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
}
@@ -9907,6 +9979,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->needs_free_space = 1;
ret = exclude_super_stripes(root, cache);
if (ret) {
/*
@@ -10277,6 +10350,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
unlock_chunks(root);
+ ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+ if (ret)
+ goto out;
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -10325,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
num_items = 3 + map->num_stripes;
free_extent_map(em);
@@ -10512,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
- return 1;
+ return -EINVAL;
features = btrfs_super_incompat_flags(disk_super);
if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
@@ -10742,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
}
return 1;
}
+
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+ schedule(); /* 'a' is not used: simply give up the CPU */
+ return 0; /* always 0; passed as the action to wait_on_atomic_t() */
+}
+
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
+{
+ while (true) { /* loop until btrfs_start_write_no_snapshoting() returns non-zero */
+ int ret;
+
+ ret = btrfs_start_write_no_snapshoting(root);
+ if (ret)
+ break;
+ wait_on_atomic_t(&root->will_be_snapshoted, /* NOTE(review): presumably returns once this counter reaches zero - confirm */
+ wait_snapshoting_atomic_t,
+ TASK_UNINTERRUPTIBLE); /* sleep uninterruptibly, then retry */
+ }
+}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/fs/btrfs/extent-tree.h
+++ /dev/null
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9abe18763a7f..2e7c97a3f344 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1285,20 +1285,6 @@ search_again:
}
/* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, bits, NULL,
- NULL, mask);
-}
-
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset)
@@ -1323,17 +1309,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
cached, mask, NULL);
}
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
-{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
-}
-
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset)
@@ -1348,63 +1323,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
changeset);
}
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE,
- NULL, cached_state, mask);
-}
-
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, mask);
-}
-
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
-}
-
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
- NULL, mask);
-}
-
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
-
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, mask);
-}
-
/*
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached_state)
+ struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
EXTENT_LOCKED, &failed_start,
cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
@@ -1417,11 +1347,6 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, 0, NULL);
-}
-
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
@@ -1438,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
return 1;
}
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- mask);
-}
-
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- GFP_NOFS);
-}
-
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1464,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1481,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
/*
* helper function to set both pages and extents in the tree writeback
*/
-static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1500,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1800,7 +1709,7 @@ again:
BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1820,7 +1729,7 @@ out_failed:
return found;
}
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
@@ -1835,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (page_ops == 0)
- return 0;
+ return;
if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
mapping_set_error(inode->i_mapping, -EIO);
@@ -1869,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
index += ret;
cond_resched();
}
- return 0;
}
/*
@@ -2516,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
/* lots and lots of room for performance fixes in the end_bio funcs */
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
@@ -2537,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
ret = ret < 0 ? ret : -EIO;
mapping_set_error(page->mapping, ret);
}
- return 0;
}
/*
@@ -2579,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (end_extent_writepage(page, bio->bi_error, start, end))
- continue;
-
+ end_extent_writepage(page, bio->bi_error, start, end);
end_page_writeback(page);
}
@@ -4326,7 +4231,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state);
+ lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -4387,7 +4292,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 end = start + PAGE_CACHE_SIZE - 1;
if (gfpflags_allow_blocking(mask) &&
- page->mapping->host->i_size > 16 * 1024 * 1024) {
+ page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
len = end - start + 1;
@@ -4536,7 +4441,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4797,24 +4702,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
{
struct extent_buffer *eb;
- unsigned long len;
unsigned long num_pages;
unsigned long i;
- if (!fs_info) {
- /*
- * Called only from tests that don't always have a fs_info
- * available, but we know that nodesize is 4096
- */
- len = 4096;
- } else {
- len = fs_info->tree_root->nodesize;
- }
- num_pages = num_extent_pages(0, len);
+ num_pages = num_extent_pages(start, len);
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
@@ -4837,6 +4732,24 @@ err:
return NULL;
}
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ unsigned long len; /* buffer length passed on to __alloc_dummy_extent_buffer() */
+
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize; /* otherwise use the tree root's nodesize */
+ }
+
+ return __alloc_dummy_extent_buffer(fs_info, start, len); /* result of the helper is returned unchanged */
+}
+
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -5227,7 +5140,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
return was_dirty;
}
-int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5240,10 +5153,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
if (page)
ClearPageUptodate(page);
}
- return 0;
}
-int set_extent_buffer_uptodate(struct extent_buffer *eb)
+void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5255,7 +5167,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
page = eb->pages[i];
SetPageUptodate(page);
}
- return 0;
}
int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5594,6 +5505,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
+/*
+ * The extent buffer bitmap operations are done with byte granularity because
+ * bitmap items are not guaranteed to be aligned to a word and therefore a
+ * single word in a bitmap may straddle two pages in the extent buffer.
+ *
+ * BIT_BYTE(nr): index of the byte that holds bit number nr.
+ * BYTE_MASK: a value with the low BITS_PER_BYTE bits set.
+ * BITMAP_FIRST_BYTE_MASK(start): the bits of one byte from bit
+ * (start modulo BITS_PER_BYTE) up to the top bit of the byte.
+ * BITMAP_LAST_BYTE_MASK(nbits): the low (nbits modulo BITS_PER_BYTE) bits of
+ * one byte, or the whole byte when nbits is a multiple of BITS_PER_BYTE.
+ */
+#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BITMAP_FIRST_BYTE_MASK(start) \
+ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+#define BITMAP_LAST_BYTE_MASK(nbits) \
+ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+
+/*
+ * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+ * given bit number
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains the
+ * given bit number
+ * @page_offset: return offset into the page given by page_index
+ *
+ * This helper hides the ugliness of finding the byte in an extent buffer which
+ * contains a given bit.
+ */
+static inline void eb_bitmap_offset(struct extent_buffer *eb,
+ unsigned long start, unsigned long nr,
+ unsigned long *page_index,
+ size_t *page_offset)
+{
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); /* offset of eb->start inside its page */
+ size_t byte_offset = BIT_BYTE(nr); /* byte of the bitmap that holds bit nr */
+ size_t offset;
+
+ /*
+ * The byte we want is the offset of the extent buffer + the offset of
+ * the bitmap item in the extent buffer + the offset of the byte in the
+ * bitmap item.
+ */
+ offset = start_offset + start + byte_offset;
+
+ *page_index = offset >> PAGE_CACHE_SHIFT; /* whole pages covered by offset */
+ *page_offset = offset & (PAGE_CACHE_SIZE - 1); /* remainder within that page */
+}
+
+/**
+ * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
+ *
+ * The byte holding the bit is located with eb_bitmap_offset() and read through
+ * page_address() of the corresponding extent buffer page. A WARN_ON fires if
+ * that page is not uptodate.
+ *
+ * Return: 1 if the bit is set, 0 if it is clear.
+ */
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+
+ eb_bitmap_offset(eb, start, nr, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); /* shift the wanted bit down to bit 0 and isolate it */
+}
+
+/**
+ * extent_buffer_bitmap_set - set an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
+ *
+ * Bits are set one byte at a time: first the byte containing @pos (from @pos
+ * to the end of that byte), then whole bytes, and finally a partial last byte
+ * if bits remain. When the byte offset reaches PAGE_CACHE_SIZE and bits are
+ * still left, the walk continues at offset 0 of the next page in eb->pages.
+ * A WARN_ON fires for every touched page that is not uptodate.
+ */
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len; /* bit number one past the last bit to set */
+ int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); /* bits from pos to the end of its byte */
+ unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_set) { /* the range reaches the end of the current byte */
+ kaddr[offset] |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0U; /* every following byte starts out as a full byte */
+ if (++offset >= PAGE_CACHE_SIZE && len > 0) { /* crossed a page boundary with bits left */
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) { /* remaining bits end inside the current byte */
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] |= mask_to_set;
+ }
+}
+
+
+/**
+ * extent_buffer_bitmap_clear - clear an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
+ *
+ * Bits are cleared one byte at a time: first the byte containing @pos (from
+ * @pos to the end of that byte), then whole bytes, and finally a partial last
+ * byte if bits remain. When the byte offset reaches PAGE_CACHE_SIZE and bits
+ * are still left, the walk continues at offset 0 of the next page in
+ * eb->pages. A WARN_ON fires for every touched page that is not uptodate.
+ */
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len; /* bit number one past the last bit to clear */
+ int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); /* bits from pos to the end of its byte */
+ unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_clear) { /* the range reaches the end of the current byte */
+ kaddr[offset] &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_BYTE;
+ mask_to_clear = ~0U; /* every following byte starts out as a full byte */
+ if (++offset >= PAGE_CACHE_SIZE && len > 0) { /* crossed a page boundary with bits left */
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) { /* remaining bits end inside the current byte */
+ mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] &= ~mask_to_clear;
+ }
+}
+
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
unsigned long distance = (src > dst) ? src - dst : dst - src;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f4c1ae11855f..0377413bd4b9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -199,12 +199,14 @@ int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask);
+ struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL); /* NULL: caller keeps no cached extent state */
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
@@ -221,39 +223,105 @@ void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, /* wake = 1, delete = 0, no cached state */
+ GFP_NOFS);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, /* wake = 1, delete = 0; caller supplies cached state and gfp mask */
+ mask);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+		u64 end, unsigned bits, gfp_t mask)
+{
+	/* Request a wake-up only when EXTENT_LOCKED is among the cleared bits. */
+	const int wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+
+	/* delete = 0 and no cached extent state, as for the other wrappers. */
+	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+}
+
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); /* no failed_start, no cached state */
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, /* wake = 0, delete = 0 */
+ cached_state, mask);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, /* no failed_start ... */
+ NULL, mask); /* ... and no cached state */
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | /* clears more than just the dirty bit */
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); /* wake = 0, delete = 0, no cached state */
+}
+
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE, /* the range is marked uptodate as well */
+ NULL, cached_state, mask); /* no failed_start */
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, /* same bits as set_extent_delalloc() plus EXTENT_DEFRAG */
+ NULL, cached_state, mask); /* no failed_start */
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); /* no failed_start, no cached state */
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, /* no failed_start */
+ cached_state, mask);
+}
+
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
@@ -282,8 +350,10 @@ void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -328,19 +398,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+void set_extent_buffer_uptodate(struct extent_buffer *eb);
+void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
unsigned long *map_len);
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
@@ -357,7 +433,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
int mirror_num);
int clean_io_failure(struct inode *inode, u64 start, struct page *page,
unsigned int pg_offset);
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6a98bddd8f33..84fb56d5c018 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->bdev);
+ kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b2991fd8583e..eb8b8fae036b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -32,7 +32,15 @@ struct extent_map {
u64 block_len;
u64 generation;
unsigned long flags;
- struct block_device *bdev;
+ union {
+ struct block_device *bdev;
+
+ /*
+ * used for chunk mappings
+ * flags & EXTENT_FLAG_FS_MAPPING must be set
+ */
+ struct map_lookup *map_lookup;
+ };
atomic_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 58ece6558430..a67e1c828d0f 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
}
if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
- path->reada = 2;
+ path->reada = READA_FORWARD;
WARN_ON(bio->bi_vcnt <= 0);
@@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (search_commit) {
path->skip_locking = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e3d9022bfd4e..098bb8f690c9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
*/
-static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
- size_t write_bytes,
+static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
struct page **prepared_pages,
struct iov_iter *i)
{
@@ -1394,7 +1393,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos, 0, cached_state);
+ start_pos, last_pos, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
@@ -1588,8 +1587,7 @@ again:
ret = 0;
}
- copied = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, i);
+ copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
/*
* if we have trouble faulting in the pages, fall
@@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
loff_t pos;
size_t count;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = generic_write_checks(iocb, from);
if (err <= 0) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
@@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* to stop this write operation to ensure FS consistency.
*/
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
err = -EROFS;
goto out;
}
@@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
end_pos = round_up(pos + count, root->sectorsize);
err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
}
@@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
iocb->ki_pos = pos + num_written;
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* We also have to set last_sub_trans to the current log transid,
@@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = start_ordered_ops(inode, start, end);
}
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
atomic_inc(&root->log_batch);
@@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
@@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
trans->sync = true;
@@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* If any of the ordered extents had an error, just return it to user
@@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
ret = find_first_non_hole(inode, &offset, &len);
if (ret < 0)
@@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncated_page = true;
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
@@ -2398,7 +2396,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
/*
@@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
ret = btrfs_wait_ordered_range(inode, lockstart,
lockend - lockstart + 1);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
@@ -2576,7 +2574,7 @@ out_only_mutex:
ret = btrfs_end_transaction(trans, root);
}
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret && !err)
err = ret;
return err;
@@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret < 0)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
goto out;
@@ -2705,7 +2703,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state);
+ locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -2818,7 +2816,7 @@ out:
* So this is completely used as cleanup.
*/
btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, alloc_start,
alloc_end - alloc_start);
@@ -2852,7 +2850,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
while (start < inode->i_size) {
@@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_END:
case SEEK_CUR:
@@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_DATA:
case SEEK_HOLE:
if (offset >= i_size_read(inode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
ret = find_desired_extent(inode, &offset, whence);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cfe99bec49de..8f835bfa1bdd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -30,7 +30,7 @@
#include "volumes.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+#define MAX_CACHE_BYTES_PER_GIG SZ_32K
struct btrfs_trim_range {
u64 start;
@@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root,
static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
int ret;
/* Write out the bitmaps */
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
-
+ list_for_each_entry_safe(entry, next, bitmap_list, list) {
ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
return -ENOSPC;
@@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode)
static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
+ list_for_each_entry_safe(entry, next, bitmap_list, list)
list_del_init(&entry->list);
- }
}
static void noinline_for_stack
@@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
* at or below 32k, so we need to adjust how much memory we allow to be
* used by extent based free space tracking
*/
- if (size < 1024 * 1024 * 1024)
+ if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
- max_bytes = MAX_CACHE_BYTES_PER_GIG *
- div_u64(size, 1024 * 1024 * 1024);
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
/*
* we want to account for 1 more bitmap than what we have so we can make
@@ -2016,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_space_op = {
+static const struct btrfs_free_space_op free_space_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2489,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
* track of free space, and if we pass 1/2 of that we want to
* start converting things over to using bitmaps
*/
- ctl->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
+ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
/*
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index f251865eb6f3..33178c490ace 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -37,7 +37,7 @@ struct btrfs_free_space_ctl {
int total_bitmaps;
int unit;
u64 start;
- struct btrfs_free_space_op *op;
+ const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
new file mode 100644
index 000000000000..393e36bd5845
--- /dev/null
+++ b/fs/btrfs/free-space-tree.c
@@ -0,0 +1,1591 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "free-space-tree.h"
+#include "transaction.h"
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+{
+ u32 bitmap_range; /* bytes of the block group covered by one bitmap item */
+ size_t bitmap_size; /* size of one bitmap item including its item header */
+ u64 num_bitmaps, total_bitmap_size;
+
+ /*
+ * We convert to bitmaps when the disk space required for using extents
+ * exceeds that required for using bitmaps.
+ *
+ * The high threshold is the total size of all bitmap items needed for
+ * this block group (item header plus BTRFS_FREE_SPACE_BITMAP_SIZE
+ * each) divided by the size of a bare item header.
+ */
+ bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, /* rounded up */
+ bitmap_range);
+ bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
+ total_bitmap_size = num_bitmaps * bitmap_size;
+ cache->bitmap_high_thresh = div_u64(total_bitmap_size,
+ sizeof(struct btrfs_item));
+
+ /*
+ * We allow for a small buffer between the high threshold and low
+ * threshold to avoid thrashing back and forth between the two formats.
+ * The low threshold sits 100 below the high one, clamped at zero.
+ */
+ if (cache->bitmap_high_thresh > 100)
+ cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100;
+ else
+ cache->bitmap_low_thresh = 0;
+}
+
+/*
+ * Insert a fresh free space info item for @block_group into the free space
+ * tree, with an extent count of 0 and no flags set.
+ *
+ * @path is always released before returning. Returns 0 on success or the
+ * error reported by btrfs_insert_empty_item().
+ */
+static int add_new_free_space_info(struct btrfs_trans_handle *trans,
+		struct btrfs_fs_info *fs_info,
+		struct btrfs_block_group_cache *block_group,
+		struct btrfs_path *path)
+{
+	struct btrfs_root *fst_root = fs_info->free_space_root;
+	struct btrfs_key info_key;
+	int err;
+
+	info_key.objectid = block_group->key.objectid;
+	info_key.type = BTRFS_FREE_SPACE_INFO_KEY;
+	info_key.offset = block_group->key.offset;
+
+	err = btrfs_insert_empty_item(trans, fst_root, path, &info_key,
+			sizeof(struct btrfs_free_space_info));
+	if (!err) {
+		struct extent_buffer *eb = path->nodes[0];
+		struct btrfs_free_space_info *fsi;
+
+		/* Initialise the new item in place and mark its leaf dirty. */
+		fsi = btrfs_item_ptr(eb, path->slots[0],
+				struct btrfs_free_space_info);
+		btrfs_set_free_space_extent_count(eb, fsi, 0);
+		btrfs_set_free_space_flags(eb, fsi, 0);
+		btrfs_mark_buffer_dirty(eb);
+	}
+
+	btrfs_release_path(path);
+	return err;
+}
+
+/*
+ * Look up the free space info item of @block_group in the free space tree.
+ *
+ * The key searched for is (block group objectid, BTRFS_FREE_SPACE_INFO_KEY,
+ * block group offset); @cow is handed through to btrfs_search_slot().
+ *
+ * On success returns a pointer to the item inside path->nodes[0]. This
+ * function never releases @path, so the caller must do so on every outcome.
+ * On failure returns ERR_PTR() with the btrfs_search_slot() error, or with
+ * -ENOENT when there is no exact match for the key.
+ */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+		struct btrfs_fs_info *fs_info,
+		struct btrfs_block_group_cache *block_group,
+		struct btrfs_path *path, int cow)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = block_group->key.objectid;
+	key.type = BTRFS_FREE_SPACE_INFO_KEY;
+	key.offset = block_group->key.offset;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret != 0) {
+		/*
+		 * No trailing newline, matching the btrfs_err() messages used
+		 * elsewhere in this file.
+		 */
+		btrfs_warn(fs_info, "missing free space info for %llu",
+				block_group->key.objectid);
+		ASSERT(0);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_free_space_info);
+}
+
+/*
+ * btrfs_search_slot() but we're looking for the greatest key less than the
+ * passed key.
+ *
+ * An exact match for @key, or a search that lands on slot 0 of the leaf, is
+ * treated as an error: both cases hit ASSERT(0) and return -EIO. A negative
+ * return from btrfs_search_slot() is passed through unchanged. On success
+ * p->slots[0] has been moved back by one slot and 0 is returned.
+ */
+static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int ins_len, int cow)
+{
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) { /* exact match: callers are expected to pass a key that is not in the tree */
+ ASSERT(0);
+ return -EIO;
+ }
+
+ if (p->slots[0] == 0) { /* there is no previous slot in this leaf to step back to */
+ ASSERT(0);
+ return -EIO;
+ }
+ p->slots[0]--;
+
+ return 0;
+}
+
+/*
+ * Number of bytes needed to hold one bit per @sectorsize unit of @size,
+ * rounded up to a whole byte.
+ */
+static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+{
+	const u32 nr_bits = (u32)div_u64(size, sectorsize);
+
+	return DIV_ROUND_UP(nr_bits, BITS_PER_BYTE);
+}
+
+static unsigned long *alloc_bitmap(u32 bitmap_size)
+{
+ return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, /* __GFP_ZERO: the bitmap starts out all clear; callers release it with vfree() */
+ PAGE_KERNEL);
+}
+
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap; /* in-memory bitmap for the whole block group, one bit per sector */
+ char *bitmap_cursor;
+ u64 start, end;
+ u64 bitmap_range, i;
+ u32 bitmap_size, flags, expected_extent_count;
+ u32 extent_count = 0; /* extent items actually found and deleted */
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ block_group->sectorsize);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1; /* search key sorts after every item that starts inside the block group */
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) { /* phase 1: walk the extent items backwards, record them in the bitmap and delete them */
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++; /* undo the step back done by btrfs_search_prev_slot(); the loop below looks at slots[0] - 1 */
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { /* reached the info item: nothing older belongs to this block group */
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+ u64 first, last; /* sector indexes relative to the block group start */
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ first = div_u64(found_key.objectid - start,
+ block_group->sectorsize);
+ last = div_u64(found_key.objectid + found_key.offset - start,
+ block_group->sectorsize);
+ bitmap_set(bitmap, first, last - first);
+
+ extent_count++;
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0); /* NOTE(review): slots[0] is not changed on this branch; if ASSERT() can compile to a no-op the loop would not terminate - confirm */
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr); /* drop the nr extent items just walked in this leaf */
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1); /* phase 2: flag the block group as using bitmaps */
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ if (extent_count != expected_extent_count) { /* the items found must agree with the count stored in the info item */
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ bitmap_cursor = (char *)bitmap; /* NOTE(review): bitmap was filled with bitmap_set() on unsigned longs but is copied out bytewise below - looks endianness-sensitive, confirm on big-endian */
+ bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ i = start;
+ while (i < end) { /* phase 3: write the bitmap back as one bitmap item per bitmap_range bytes */
+ unsigned long ptr;
+ u64 extent_size; /* bytes of the block group covered by this bitmap item */
+ u32 data_size; /* bytes of bitmap data stored in this item */
+
+ extent_size = min(end - i, bitmap_range);
+ data_size = free_space_bitmap_size(extent_size,
+ block_group->sectorsize);
+
+ key.objectid = i;
+ key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
+ key.offset = extent_size;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ data_size);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ i += extent_size;
+ bitmap_cursor += data_size;
+ }
+
+ ret = 0;
+out:
+ vfree(bitmap);
+ if (ret) /* any failure, including the allocation failure above, aborts the transaction */
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap; /* in-memory copy of all bitmap items of the block group */
+ u64 start, end;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 offset;
+ u32 bitmap_size, flags, expected_extent_count;
+ int prev_bit = 0, bit, bitnr;
+ u32 extent_count = 0; /* extent items inserted by this function */
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ block_group->sectorsize);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1; /* search key sorts after every item that starts inside the block group */
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) { /* phase 1: walk the bitmap items backwards, copy them into memory and delete them */
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++; /* undo the step back done by btrfs_search_prev_slot(); the loop below looks at slots[0] - 1 */
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { /* reached the info item: nothing older belongs to this block group */
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ unsigned long ptr;
+ char *bitmap_cursor;
+ u32 bitmap_pos, data_size; /* byte position in the in-memory bitmap, bytes of data in this item */
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ bitmap_pos = div_u64(found_key.objectid - start,
+ block_group->sectorsize *
+ BITS_PER_BYTE);
+ bitmap_cursor = ((char *)bitmap) + bitmap_pos; /* NOTE(review): on-disk bytes are copied verbatim and later read with test_bit() on unsigned longs - looks endianness-sensitive, confirm on big-endian */
+ data_size = free_space_bitmap_size(found_key.offset,
+ block_group->sectorsize);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ read_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0); /* NOTE(review): slots[0] is not changed on this branch; if ASSERT() can compile to a no-op the loop would not terminate - confirm */
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr); /* drop the nr bitmap items just walked in this leaf */
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1); /* phase 2: flag the block group as no longer using bitmaps */
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ offset = start;
+ bitnr = 0;
+ while (offset < end) { /* phase 3: scan the bitmap one sector at a time and insert an extent item per run of set bits */
+ bit = !!test_bit(bitnr, bitmap);
+ if (prev_bit == 0 && bit == 1) { /* a run of free sectors starts here */
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) { /* the run ended just before this sector */
+ key.objectid = extent_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = offset - extent_start;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0); /* extent items carry no data, only the key */
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += block_group->sectorsize;
+ bitnr++;
+ }
+ if (prev_bit == 1) { /* a run that extends to the end of the block group is still open */
+ key.objectid = extent_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = end - extent_start;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) { /* the runs found must agree with the count stored in the info item */
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ vfree(bitmap);
+ if (ret) /* any failure, including the allocation failure above, aborts the transaction */
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+/*
+ * Add new_extents (which may be negative) to the extent count stored in the
+ * block group's free space info item. If the new count crosses
+ * bitmap_high_thresh while the block group uses extents, or drops below
+ * bitmap_low_thresh while it uses bitmaps, convert the block group to the
+ * other representation.
+ *
+ * Returns 0 on success (including the new_extents == 0 no-op), otherwise the
+ * error from the info item lookup or from the conversion.
+ */
+static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ int new_extents)
+{
+ struct btrfs_free_space_info *info;
+ struct extent_buffer *leaf;
+ u32 count;
+ int using_bitmaps;
+
+ /* Nothing to record. */
+ if (!new_extents)
+ return 0;
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ leaf = path->nodes[0];
+ using_bitmaps = !!(btrfs_free_space_flags(leaf, info) &
+ BTRFS_FREE_SPACE_USING_BITMAPS);
+ count = btrfs_free_space_extent_count(leaf, info);
+ count += new_extents;
+
+ btrfs_set_free_space_extent_count(leaf, info, count);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ /* Switch representation only when the matching threshold is crossed. */
+ if (using_bitmaps) {
+ if (count < block_group->bitmap_low_thresh) {
+ return convert_free_space_to_extents(trans, fs_info,
+ block_group, path);
+ }
+ } else {
+ if (count > block_group->bitmap_high_thresh) {
+ return convert_free_space_to_bitmaps(trans, fs_info,
+ block_group, path);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return 1 if the bit for the block at byte offset @offset is set in the free
+ * space bitmap item that @path currently points at, 0 otherwise.
+ *
+ * The item at path->slots[0] must be a FREE_SPACE_BITMAP item whose range
+ * contains @offset; both conditions are only checked with ASSERT().
+ */
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_buffer *eb = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key bitmap_key;
+ unsigned long item_ptr, bitnr;
+
+ btrfs_item_key_to_cpu(eb, &bitmap_key, slot);
+ ASSERT(bitmap_key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+ ASSERT(offset >= bitmap_key.objectid &&
+ offset < bitmap_key.objectid + bitmap_key.offset);
+
+ /* One bit per sectorsize bytes, counted from the start of the item. */
+ item_ptr = btrfs_item_ptr_offset(eb, slot);
+ bitnr = div_u64(offset - bitmap_key.objectid, block_group->sectorsize);
+
+ return !!extent_buffer_test_bit(eb, item_ptr, bitnr);
+}
+
+/*
+ * Set (bit != 0) or clear (bit == 0) the bits covering [*start, *start + *size)
+ * in the free space bitmap item that @path points at. The range is clamped to
+ * the end of that bitmap item.
+ *
+ * On return *start is advanced past the part that was handled and *size is
+ * reduced by the same amount, so a caller can continue with the next bitmap
+ * while *size is non-zero.
+ */
+static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ int bit)
+{
+ struct extent_buffer *eb = path->nodes[0];
+ struct btrfs_key bitmap_key;
+ u64 bitmap_start, bitmap_end;
+ u64 range_end = *start + *size;
+ unsigned long item_ptr, first_bit, last_bit;
+
+ btrfs_item_key_to_cpu(eb, &bitmap_key, path->slots[0]);
+ ASSERT(bitmap_key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ bitmap_start = bitmap_key.objectid;
+ bitmap_end = bitmap_key.objectid + bitmap_key.offset;
+ ASSERT(*start >= bitmap_start && *start < bitmap_end);
+ ASSERT(range_end > bitmap_start);
+
+ /* Only touch the part of the range that lives in this bitmap. */
+ if (range_end > bitmap_end)
+ range_end = bitmap_end;
+
+ item_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+ first_bit = div_u64(*start - bitmap_start, block_group->sectorsize);
+ last_bit = div_u64(range_end - bitmap_start, block_group->sectorsize);
+
+ if (bit) {
+ extent_buffer_bitmap_set(eb, item_ptr, first_bit,
+ last_bit - first_bit);
+ } else {
+ extent_buffer_bitmap_clear(eb, item_ptr, first_bit,
+ last_bit - first_bit);
+ }
+ btrfs_mark_buffer_dirty(eb);
+
+ /* Report progress back to the caller. */
+ *size -= range_end - *start;
+ *start = range_end;
+}
+
+/*
+ * Move @p to the bitmap item following the current one.
+ *
+ * btrfs_next_item() cannot be used from modify_free_space_bitmap() because
+ * btrfs_next_leaf() doesn't get the path for writing. The generic tree walk
+ * is not needed anyway: the next bitmap starts exactly where the current one
+ * ends, so when the current item is the last one in its leaf we simply search
+ * again from that position.
+ *
+ * Returns 0 when the next item is in the same leaf, otherwise the result of
+ * btrfs_search_prev_slot().
+ */
+static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p)
+{
+ struct btrfs_key next;
+
+ if (p->slots[0] + 1 >= btrfs_header_nritems(p->nodes[0])) {
+ /* Last item in the leaf: remember where it ends and re-search. */
+ btrfs_item_key_to_cpu(p->nodes[0], &next, p->slots[0]);
+ btrfs_release_path(p);
+
+ next.objectid += next.offset;
+ next.type = (u8)-1;
+ next.offset = (u64)-1;
+
+ return btrfs_search_prev_slot(trans, root, &next, p, 0, 1);
+ }
+
+ /* The next item lives in the same leaf. */
+ p->slots[0]++;
+ return 0;
+}
+
+/*
+ * Set or clear the bits for the range [start, start + size) in the block
+ * group's free space bitmap items and then adjust the free space extent
+ * count.
+ *
+ * If remove is 1, then we are removing free space, thus clearing bits in the
+ * bitmap. If remove is 0, then we are adding free space, thus setting bits in
+ * the bitmap.
+ *
+ * The bits of the blocks immediately before and after the range are read to
+ * work out how the number of contiguous free extents changes; that delta is
+ * passed to update_free_space_extent_count().
+ *
+ * Returns 0 on success, or the first non-zero value returned by a tree search
+ * or by update_free_space_extent_count().
+ */
+static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size, int remove)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 end = start + size;
+ u64 cur_start, cur_size;
+ int prev_bit, next_bit;
+ int new_extents;
+ int ret;
+
+ /*
+ * Read the bit for the block immediately before the extent of space if
+ * that block is within the block group.
+ */
+ if (start > block_group->key.objectid) {
+ u64 prev_block = start - block_group->sectorsize;
+
+ key.objectid = prev_block;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = free_space_test_bit(block_group, path, prev_block);
+
+ /* The previous block may have been in the previous bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (start >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+ } else {
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ /* There is no previous block inside this block group. */
+ prev_bit = -1;
+ }
+
+ /*
+ * Iterate over all of the bitmaps overlapped by the extent of space,
+ * clearing/setting bits as required.
+ */
+ cur_start = start;
+ cur_size = size;
+ while (1) {
+ /* Advances cur_start and shrinks cur_size by what it handled. */
+ free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ !remove);
+ if (cur_size == 0)
+ break;
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * Read the bit for the block immediately after the extent of space if
+ * that block is within the block group.
+ */
+ if (end < block_group->key.objectid + block_group->key.offset) {
+ /* The next block may be in the next bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (end >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ next_bit = free_space_test_bit(block_group, path, end);
+ } else {
+ /* There is no following block inside this block group. */
+ next_bit = -1;
+ }
+
+ /*
+ * Work out the net change in the number of free space extents from the
+ * state of the two neighboring blocks.
+ */
+ if (remove) {
+ new_extents = -1;
+ if (prev_bit == 1) {
+ /* Leftover on the left. */
+ new_extents++;
+ }
+ if (next_bit == 1) {
+ /* Leftover on the right. */
+ new_extents++;
+ }
+ } else {
+ new_extents = 1;
+ if (prev_bit == 1) {
+ /* Merging with neighbor on the left. */
+ new_extents--;
+ }
+ if (next_bit == 1) {
+ /* Merging with neighbor on the right. */
+ new_extents--;
+ }
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+/*
+ * Remove the range [start, start + size) from a block group that stores its
+ * free space as extent items.
+ *
+ * The extent item containing the range is looked up and deleted, items for
+ * any leftover space before and after the range are inserted, and the net
+ * change in the number of extents is passed to
+ * update_free_space_extent_count().
+ *
+ * Returns 0 on success or the first non-zero value returned by one of the
+ * tree operations.
+ */
+static int remove_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ /* Deleting the found item accounts for -1; leftovers add back below. */
+ int new_extents = -1;
+ int ret;
+
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(start >= found_start && end <= found_end);
+
+ /*
+ * Okay, now that we've found the free space extent which contains the
+ * free space that we are removing, there are four cases:
+ *
+ * 1. We're using the whole extent: delete the key we found and
+ * decrement the free space extent count.
+ * 2. We are using part of the extent starting at the beginning: delete
+ * the key we found and insert a new key representing the leftover at
+ * the end. There is no net change in the number of extents.
+ * 3. We are using part of the extent ending at the end: delete the key
+ * we found and insert a new key representing the leftover at the
+ * beginning. There is no net change in the number of extents.
+ * 4. We are using part of the extent in the middle: delete the key we
+ * found and insert two new keys representing the leftovers on each
+ * side. Where we used to have one extent, we now have two, so increment
+ * the extent count. We may need to convert the block group to bitmaps
+ * as a result.
+ */
+
+ /* Delete the existing key (cases 1-4). */
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ /* Add a key for leftovers at the beginning (cases 3 and 4). */
+ if (start > found_start) {
+ key.objectid = found_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = start - found_start;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ /* Add a key for leftovers at the end (cases 2 and 4). */
+ if (end < found_end) {
+ key.objectid = end;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = found_end - end;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+/*
+ * Remove [start, start + size) from the free space recorded for block_group,
+ * using whichever representation (bitmaps or extents) the block group's free
+ * space info item says is in use.
+ *
+ * If the block group has not been added to the free space tree yet
+ * (needs_free_space is set), it is added first. The public wrapper
+ * remove_from_free_space_tree() calls this with block_group->free_space_lock
+ * held.
+ *
+ * Returns 0 on success or a non-zero error from one of the helpers.
+ */
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ int using_bitmaps;
+ int ret;
+
+ if (block_group->needs_free_space) {
+ ret = __add_block_group_free_space(trans, fs_info, block_group,
+ path);
+ if (ret)
+ return ret;
+ }
+
+ /* Read-only lookup: we only need the flags here. */
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ using_bitmaps = !!(btrfs_free_space_flags(path->nodes[0], info) &
+ BTRFS_FREE_SPACE_USING_BITMAPS);
+ btrfs_release_path(path);
+
+ if (!using_bitmaps) {
+ return remove_free_space_extent(trans, fs_info, block_group,
+ path, start, size);
+ }
+
+ return modify_free_space_bitmap(trans, fs_info, block_group,
+ path, start, size, 1);
+}
+
+/*
+ * Remove [start, start + size) from the free space tree.
+ *
+ * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro bit is not
+ * set. Otherwise the block group containing start is looked up and the
+ * removal is done under its free_space_lock. Any failure (including -ENOMEM
+ * for the path and -ENOENT for a missing block group) aborts the transaction
+ * and is returned to the caller.
+ */
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *bg;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ bg = btrfs_lookup_block_group(fs_info, start);
+ if (bg) {
+ mutex_lock(&bg->free_space_lock);
+ ret = __remove_from_free_space_tree(trans, fs_info, bg, path,
+ start, size);
+ mutex_unlock(&bg->free_space_lock);
+
+ /* Drop the reference taken by the lookup. */
+ btrfs_put_block_group(bg);
+ } else {
+ ASSERT(0);
+ ret = -ENOENT;
+ }
+
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+/*
+ * Add the range [start, start + size) to a block group that stores its free
+ * space as extent items, merging it with directly adjacent free extents.
+ *
+ * Neighbors that touch the new range are deleted and folded into a single new
+ * key, which is then inserted; the net change in the number of extents is
+ * passed to update_free_space_extent_count().
+ *
+ * Returns 0 on success or the first non-zero value returned by one of the
+ * tree operations.
+ */
+static int add_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key, new_key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ /* Inserting the new key accounts for +1; each merge subtracts one. */
+ int new_extents = 1;
+ int ret;
+
+ /*
+ * We are adding a new extent of free space, but we need to merge
+ * extents. There are four cases here:
+ *
+ * 1. The new extent does not have any immediate neighbors to merge
+ * with: add the new key and increment the free space extent count. We
+ * may need to convert the block group to bitmaps as a result.
+ * 2. The new extent has an immediate neighbor before it: remove the
+ * previous key and insert a new key combining both of them. There is no
+ * net change in the number of extents.
+ * 3. The new extent has an immediate neighbor after it: remove the next
+ * key and insert a new key combining both of them. There is no net
+ * change in the number of extents.
+ * 4. The new extent has immediate neighbors on both sides: remove both
+ * of the keys and insert a new key combining all of them. Where we used
+ * to have two extents, we now have one, so decrement the extent count.
+ */
+
+ new_key.objectid = start;
+ new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ new_key.offset = size;
+
+ /* Search for a neighbor on the left. */
+ if (start == block_group->key.objectid)
+ goto right;
+ key.objectid = start - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* Landing on the info item means there is no extent to the left. */
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto right;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT(found_start < start && found_end <= start);
+
+ /*
+ * Delete the neighbor on the left and absorb it into the new key (cases
+ * 2 and 4).
+ */
+ if (found_end == start) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.objectid = found_start;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+right:
+ /* Search for a neighbor on the right. */
+ if (end == block_group->key.objectid + block_group->key.offset)
+ goto insert;
+ key.objectid = end;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* Landing on the info item means there is no extent at or before end. */
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto insert;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT((found_start < start && found_end <= start) ||
+ (found_start >= end && found_end > end));
+
+ /*
+ * Delete the neighbor on the right and absorb it into the new key
+ * (cases 3 and 4).
+ */
+ if (found_start == end) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+insert:
+ /* Insert the new key (cases 1-4). */
+ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+/*
+ * Record [start, start + size) as free space for block_group, using whichever
+ * representation (bitmaps or extents) the block group's free space info item
+ * says is in use.
+ *
+ * If the block group has not been added to the free space tree yet
+ * (needs_free_space is set), it is added first. The public wrapper
+ * add_to_free_space_tree() calls this with block_group->free_space_lock held.
+ *
+ * Returns 0 on success or a non-zero error from one of the helpers.
+ */
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ int using_bitmaps;
+ int ret;
+
+ if (block_group->needs_free_space) {
+ ret = __add_block_group_free_space(trans, fs_info, block_group,
+ path);
+ if (ret)
+ return ret;
+ }
+
+ /* Read-only lookup: we only need the flags here. */
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ using_bitmaps = !!(btrfs_free_space_flags(path->nodes[0], info) &
+ BTRFS_FREE_SPACE_USING_BITMAPS);
+ btrfs_release_path(path);
+
+ if (!using_bitmaps) {
+ return add_free_space_extent(trans, fs_info, block_group, path,
+ start, size);
+ }
+
+ return modify_free_space_bitmap(trans, fs_info, block_group,
+ path, start, size, 0);
+}
+
+/*
+ * Add [start, start + size) to the free space tree.
+ *
+ * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro bit is not
+ * set. Otherwise the block group containing start is looked up and the
+ * addition is done under its free_space_lock. Any failure (including -ENOMEM
+ * for the path and -ENOENT for a missing block group) aborts the transaction
+ * and is returned to the caller.
+ */
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *bg;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ bg = btrfs_lookup_block_group(fs_info, start);
+ if (bg) {
+ mutex_lock(&bg->free_space_lock);
+ ret = __add_to_free_space_tree(trans, fs_info, bg, path, start,
+ size);
+ mutex_unlock(&bg->free_space_lock);
+
+ /* Drop the reference taken by the lookup. */
+ btrfs_put_block_group(bg);
+ } else {
+ ASSERT(0);
+ ret = -ENOENT;
+ }
+
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+/*
+ * Populate the free space tree by walking the extent tree. Operations on the
+ * extent tree that happen as a result of writes to the free space tree will go
+ * through the normal add/remove hooks.
+ *
+ * A free space info item is created for block_group, then every gap between
+ * consecutive EXTENT_ITEM/METADATA_ITEM keys inside the block group, plus the
+ * gap after the last one, is added as free space while holding
+ * block_group->free_space_lock.
+ *
+ * Two paths are used: path walks the extent tree, path2 is handed to the free
+ * space tree helpers.
+ *
+ * Returns 0 on success, -ENOMEM if a path cannot be allocated, or the error
+ * from one of the tree operations.
+ */
+static int populate_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path, *path2;
+ struct btrfs_key key;
+ u64 start, end;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ /*
+ * The extent tree is walked forward with btrfs_next_item(), so ask for
+ * forward readahead by name instead of using a bare integer.
+ */
+ path->reada = READA_FORWARD;
+
+ path2 = btrfs_alloc_path();
+ if (!path2) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+ if (ret)
+ goto out;
+
+ mutex_lock(&block_group->free_space_lock);
+
+ /*
+ * Iterate through all of the extent and metadata items in this block
+ * group, adding the free space between them and the free space at the
+ * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
+ * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
+ * contained in.
+ */
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out_locked;
+ ASSERT(ret == 0);
+
+ /* start tracks the end of the last allocated extent seen so far. */
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+ while (1) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (key.objectid >= end)
+ break;
+
+ /* The gap before this extent is free space. */
+ if (start < key.objectid) {
+ ret = __add_to_free_space_tree(trans, fs_info,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
+ if (ret)
+ goto out_locked;
+ }
+ /*
+ * Skip past the extent: a METADATA_ITEM is advanced by
+ * nodesize, an EXTENT_ITEM by its key offset.
+ */
+ start = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ start += fs_info->tree_root->nodesize;
+ else
+ start += key.offset;
+ } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ /* Reached another block group's item: we are done. */
+ if (key.objectid != block_group->key.objectid)
+ break;
+ }
+
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ goto out_locked;
+ if (ret)
+ break;
+ }
+ /* Whatever is left after the last extent is free space too. */
+ if (start < end) {
+ ret = __add_to_free_space_tree(trans, fs_info, block_group,
+ path2, start, end - start);
+ if (ret)
+ goto out_locked;
+ }
+
+ ret = 0;
+out_locked:
+ mutex_unlock(&block_group->free_space_lock);
+out:
+ btrfs_free_path(path2);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Create the free space tree and fill it in for every block group in
+ * fs_info->block_group_cache_tree, all inside a single transaction.
+ *
+ * fs_info->creating_free_space_tree is set while the tree is being built. On
+ * success the FREE_SPACE_TREE compat_ro bit is set and the transaction is
+ * committed; the commit's return value is returned. On failure the
+ * transaction is aborted and ended and the error is returned.
+ */
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *new_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group_cache *bg;
+ struct rb_node *n;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ fs_info->creating_free_space_tree = 1;
+ new_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_FREE_SPACE_TREE_OBJECTID);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto abort;
+ }
+ fs_info->free_space_root = new_root;
+
+ /* Populate the new tree one block group at a time. */
+ for (n = rb_first(&fs_info->block_group_cache_tree); n;
+ n = rb_next(n)) {
+ bg = rb_entry(n, struct btrfs_block_group_cache, cache_node);
+ ret = populate_free_space_tree(trans, fs_info, bg);
+ if (ret)
+ goto abort;
+ }
+
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ fs_info->creating_free_space_tree = 0;
+
+ return btrfs_commit_transaction(trans, tree_root);
+
+abort:
+ fs_info->creating_free_space_tree = 0;
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
+ return ret;
+}
+
+/*
+ * Delete every item in the tree rooted at root.
+ *
+ * Repeatedly searches for the smallest possible key and deletes all items of
+ * the leaf the search lands in, until a search lands in an empty leaf.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the error
+ * from the search or the deletion.
+ */
+static int clear_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_key first_key = { .objectid = 0, .type = 0, .offset = 0 };
+ struct btrfs_path *path;
+ int nritems;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ for (;;) {
+ ret = btrfs_search_slot(trans, root, &first_key, path, -1, 1);
+ if (ret < 0)
+ break;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (!nritems) {
+ /* Nothing left to delete. */
+ ret = 0;
+ break;
+ }
+
+ /* Drop the whole leaf in one go. */
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nritems);
+ if (ret)
+ break;
+
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Remove the free space tree from the filesystem in a single transaction.
+ *
+ * The FREE_SPACE_TREE compat_ro bit is cleared and fs_info->free_space_root is
+ * reset first; then all items are deleted, the root item is removed from the
+ * tree root, and the root node and the in-memory root structure are freed.
+ *
+ * Returns the result of the transaction commit on success. On failure the
+ * transaction is aborted and ended and the error is returned.
+ */
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *free_space_root = fs_info->free_space_root;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * With the compat_ro bit cleared, the add/remove entry points in this
+ * file return early and stop touching the tree being torn down.
+ */
+ btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ fs_info->free_space_root = NULL;
+
+ /* Empty the tree, keeping the local pointer saved above. */
+ ret = clear_free_space_tree(trans, free_space_root);
+ if (ret)
+ goto abort;
+
+ ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key);
+ if (ret)
+ goto abort;
+
+ list_del(&free_space_root->dirty_list);
+
+ /* Release the (now empty) root node of the tree. */
+ btrfs_tree_lock(free_space_root->node);
+ clean_tree_block(trans, tree_root->fs_info, free_space_root->node);
+ btrfs_tree_unlock(free_space_root->node);
+ btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
+ 0, 1);
+
+ /* Drop the buffer references and the root structure itself. */
+ free_extent_buffer(free_space_root->node);
+ free_extent_buffer(free_space_root->commit_root);
+ kfree(free_space_root);
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ return 0;
+
+abort:
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
+ return ret;
+}
+
+/*
+ * Add a block group that has not yet been recorded in the free space tree:
+ * create its free space info item via add_new_free_space_info() and then add
+ * the block group's whole range [key.objectid, key.objectid + key.offset) as
+ * free space.
+ *
+ * Returns 0 on success or the error from either step.
+ */
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ /*
+ * Clear the flag before calling __add_to_free_space_tree() below, which
+ * would otherwise call back into this function.
+ */
+ block_group->needs_free_space = 0;
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path);
+ if (ret)
+ return ret;
+
+ return __add_to_free_space_tree(trans, fs_info, block_group, path,
+ block_group->key.objectid,
+ block_group->key.offset);
+}
+
+/*
+ * Add block_group to the free space tree if that has not happened yet.
+ *
+ * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro bit is not
+ * set. Otherwise, under block_group->free_space_lock, the block group is added
+ * only if needs_free_space is still set. A failure (including -ENOMEM for the
+ * path) aborts the transaction and is returned.
+ */
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ mutex_lock(&block_group->free_space_lock);
+ if (block_group->needs_free_space) {
+ path = btrfs_alloc_path();
+ if (path) {
+ ret = __add_block_group_free_space(trans, fs_info,
+ block_group, path);
+ } else {
+ ret = -ENOMEM;
+ }
+ }
+
+ /* path is still NULL here when there was nothing to do. */
+ btrfs_free_path(path);
+ mutex_unlock(&block_group->free_space_lock);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+/*
+ * Delete every free space tree item belonging to block_group: its extent or
+ * bitmap items and finally its free space info item.
+ *
+ * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro bit is not
+ * set or when the block group was never added to the tree. A failure aborts
+ * the transaction and is returned.
+ */
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_path *path;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ u64 start, end;
+ int done = 0, nr;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ if (block_group->needs_free_space) {
+ /* We never added this block group to the free space tree. */
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ /* Start from the last possible key inside the block group. */
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ /*
+ * Each pass deletes the block group's items found in one leaf, walking
+ * the slots backwards. The loop ends once the info item has been seen.
+ */
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ /* Point one past the found slot; the loop inspects slots[0] - 1. */
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ nr++;
+ path->slots[0]--;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
+ found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ /* Delete the nr items counted above, starting at slots[0]. */
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+/*
+ * Load the free space of a block group that uses bitmap items.
+ *
+ * path must point at the block group's free space info item; the items after
+ * it are iterated until the next info item or the end of the tree. Each run
+ * of set bits is turned into a free extent with add_new_free_space().
+ *
+ * Returns 0 on success, a negative error from btrfs_next_item(), or -EIO when
+ * the number of extents found does not match expected_extent_count.
+ */
+static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 end, offset;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = fs_info->free_space_root;
+
+ end = block_group->key.objectid + block_group->key.offset;
+
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* The next info item marks the start of another block group. */
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ caching_ctl->progress = key.objectid;
+
+ /*
+ * Scan the bitmap one block at a time. prev_bit is carried over
+ * between bitmap items, so a run of set bits may span several
+ * of them.
+ */
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(block_group, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ total_found += add_new_free_space(block_group,
+ fs_info,
+ extent_start,
+ offset);
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += block_group->sectorsize;
+ }
+ }
+ /* A run still open at the end extends to the end of the block group. */
+ if (prev_bit == 1) {
+ total_found += add_new_free_space(block_group, fs_info,
+ extent_start, end);
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ caching_ctl->progress = (u64)-1;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * Load the free space of a block group that uses extent items.
+ *
+ * path must point at the block group's free space info item; the items after
+ * it are iterated until the next info item or the end of the tree, and each
+ * one is passed to add_new_free_space().
+ *
+ * Returns 0 on success, a negative error from btrfs_next_item(), or -EIO when
+ * the number of extents found does not match expected_extent_count.
+ */
+static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ u64 end;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = fs_info->free_space_root;
+
+ end = block_group->key.objectid + block_group->key.offset;
+
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* The next info item marks the start of another block group. */
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ caching_ctl->progress = key.objectid;
+
+ /* The key itself describes the extent: [objectid, objectid + offset). */
+ total_found += add_new_free_space(block_group, fs_info,
+ key.objectid,
+ key.objectid + key.offset);
+ /* Wake waiters each time more than CACHING_CTL_WAKE_UP was found. */
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ caching_ctl->progress = (u64)-1;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * Load the free space of caching_ctl->block_group from the free space tree.
+ *
+ * Looks up the block group's free space info item in the commit root, reads
+ * the expected extent count and the flags from it, and hands over to the
+ * bitmap or extent loader depending on BTRFS_FREE_SPACE_USING_BITMAPS.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the error
+ * from the lookup or the loader.
+ */
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_free_space_info *info;
+ struct btrfs_path *path;
+ u32 extent_count, flags;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Just like caching_thread() doesn't want to deadlock on the extent
+ * tree, we don't want to deadlock on the free space tree.
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ /*
+ * The loaders walk forward with btrfs_next_item(), so ask for forward
+ * readahead by name instead of using a bare integer.
+ */
+ path->reada = READA_FORWARD;
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+
+ /*
+ * We left path pointing to the free space info item, so now
+ * load_free_space_foo can just iterate through the free space tree from
+ * there.
+ */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
+ ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ else
+ ret = load_free_space_extents(caching_ctl, path, extent_count);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
new file mode 100644
index 000000000000..54ffced3bce8
--- /dev/null
+++ b/fs/btrfs/free-space-tree.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_TREE
+#define __BTRFS_FREE_SPACE_TREE
+
+/*
+ * The default size for new free space bitmap items. The last bitmap in a block
+ * group may be truncated, and none of the free space tree code assumes that
+ * existing bitmaps are this size.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
+#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+/* Create or remove the whole free space tree; each runs its own transaction. */
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
+/* Load a block group's free space from the tree (commit root, no locking). */
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+/*
+ * The four entry points below return 0 without doing anything when the
+ * FREE_SPACE_TREE compat_ro bit is not set, and abort the transaction on
+ * failure.
+ */
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+
+/* Exposed for testing. */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow);
+/*
+ * Variants of add_to/remove_from_free_space_tree() that take the block group
+ * and path from the caller; the public wrappers call them with
+ * block_group->free_space_lock held.
+ */
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+/* Test one bit of the bitmap item that path currently points at. */
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset);
+
+#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 767a6056ac45..e50316c4af15 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -48,7 +48,7 @@ static int caching_kthread(void *data)
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.offset = 0;
@@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
}
}
-#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
/*
@@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_ino_op = {
+static const struct btrfs_free_space_op free_ino_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
return false;
}
-static struct btrfs_free_space_op pinned_free_ino_op = {
+static const struct btrfs_free_space_op pinned_free_ino_op = {
.recalc_thresholds = pinned_recalc_thresholds,
.use_bitmap = pinned_use_bitmap,
};
@@ -515,7 +515,7 @@ out:
return ret;
}
-static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
@@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
int ret;
mutex_lock(&root->objectid_mutex);
- if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
- if (ret)
- goto out;
- }
-
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
ret = -ENOSPC;
goto out;
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
index ddb347bfee23..c8e864b2d530 100644
--- a/fs/btrfs/inode-map.h
+++ b/fs/btrfs/inode-map.h
@@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans);
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b8856e182ae..e28f3d4691af 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
+struct btrfs_dio_data {
+ u64 outstanding_extents;
+ u64 reserve;
+ u64 unsubmitted_oe_range_start;
+ u64 unsubmitted_oe_range_end;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
-static struct extent_io_ops btrfs_extent_io_ops;
+static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
-static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
-static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
@@ -414,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode,
unsigned long nr_pages_ret = 0;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned long max_compressed = 128 * 1024;
- unsigned long max_uncompressed = 128 * 1024;
+ unsigned long max_compressed = SZ_128K;
+ unsigned long max_uncompressed = SZ_128K;
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
int redirty = 0;
/* if this is a small write inside eof, kick off a defrag */
- if ((end - start + 1) < 16 * 1024 &&
+ if ((end - start + 1) < SZ_16K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -430,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode,
again:
will_compress = 0;
nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
- nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+ nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
/*
* we don't want to send crud past the end of i_size through
@@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes;
/* if this is a small write inside eof, kick off defrag */
- if (num_bytes < 64 * 1024 &&
+ if (num_bytes < SZ_64K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -1107,7 +1113,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
* atomic_sub_return implies a barrier for waitqueue_active
*/
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
- 5 * 1024 * 1024 &&
+ 5 * SZ_1M &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1024;
+ int limit = 10 * SZ_1M;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
@@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
!btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
- cur_end = min(end, start + 512 * 1024 - 1);
+ cur_end = min(end, start + SZ_512K - 1);
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
@@ -1989,7 +1995,7 @@ again:
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
@@ -2482,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
lock_start = backref->file_pos;
lock_end = backref->file_pos + backref->num_bytes - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- 0, &cached);
+ &cached);
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
if (ordered) {
@@ -2874,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state);
+ &cached_state);
ret = test_range_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
@@ -3106,56 +3112,46 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
start, (size_t)(end - start + 1));
}
-struct delayed_iput {
- struct list_head list;
- struct inode *inode;
-};
-
-/* JDM: If this is fs-wide, why can't we add a pointer to
- * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct delayed_iput *delayed;
+ struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
- delayed->inode = inode;
-
spin_lock(&fs_info->delayed_iput_lock);
- list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ if (binode->delayed_iput_count == 0) {
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ } else {
+ binode->delayed_iput_count++;
+ }
spin_unlock(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_root *root)
{
- LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- struct delayed_iput *delayed;
- int empty;
spin_lock(&fs_info->delayed_iput_lock);
- empty = list_empty(&fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
- if (empty)
- return;
-
- down_read(&fs_info->delayed_iput_sem);
-
- spin_lock(&fs_info->delayed_iput_lock);
- list_splice_init(&fs_info->delayed_iputs, &list);
- spin_unlock(&fs_info->delayed_iput_lock);
-
- while (!list_empty(&list)) {
- delayed = list_entry(list.next, struct delayed_iput, list);
- list_del(&delayed->list);
- iput(delayed->inode);
- kfree(delayed);
+ while (!list_empty(&fs_info->delayed_iputs)) {
+ struct btrfs_inode *inode;
+
+ inode = list_first_entry(&fs_info->delayed_iputs,
+ struct btrfs_inode, delayed_iput);
+ if (inode->delayed_iput_count) {
+ inode->delayed_iput_count--;
+ list_move_tail(&inode->delayed_iput,
+ &fs_info->delayed_iputs);
+ } else {
+ list_del_init(&inode->delayed_iput);
+ }
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ spin_lock(&fs_info->delayed_iput_lock);
}
-
- up_read(&root->fs_info->delayed_iput_sem);
+ spin_unlock(&fs_info->delayed_iput_lock);
}
/*
@@ -3351,7 +3347,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = -ENOMEM;
goto out;
}
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -4318,7 +4314,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
/*
* We want to drop from the next block forward in case this new size is
@@ -4349,7 +4345,7 @@ search_again:
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
if (btrfs_should_end_transaction(trans, root)) {
err = -EAGAIN;
goto error;
@@ -4592,7 +4588,7 @@ error:
btrfs_free_path(path);
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
@@ -4669,7 +4665,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -4800,7 +4796,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ lock_extent_bits(io_tree, hole_start, block_end - 1,
&cached_state);
ordered = btrfs_lookup_ordered_range(inode, hole_start,
block_end - hole_start);
@@ -4876,26 +4872,6 @@ next:
return err;
}
-static int wait_snapshoting_atomic_t(atomic_t *a)
-{
- schedule();
- return 0;
-}
-
-static void wait_for_snapshot_creation(struct btrfs_root *root)
-{
- while (true) {
- int ret;
-
- ret = btrfs_start_write_no_snapshoting(root);
- if (ret)
- break;
- wait_on_atomic_t(&root->will_be_snapshoted,
- wait_snapshoting_atomic_t,
- TASK_UNINTERRUPTIBLE);
- }
-}
-
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4927,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* truncation, it must capture all writes that happened before
* this truncation.
*/
- wait_for_snapshot_creation(root);
+ btrfs_wait_for_snapshot_creation(root);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret) {
btrfs_end_write_no_snapshoting(root);
@@ -5112,7 +5088,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ lock_extent_bits(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5305,7 +5281,6 @@ void btrfs_evict_inode(struct inode *inode)
no_delete:
btrfs_remove_delayed_node(inode);
clear_inode(inode);
- return;
}
/*
@@ -5754,7 +5729,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
@@ -6482,7 +6457,7 @@ out_unlock_inode:
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
u64 index;
@@ -6508,6 +6483,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
goto fail;
}
@@ -6541,9 +6517,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
fail:
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -6688,7 +6665,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct inode *inode, struct page *page,
+ struct page *page,
size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
@@ -6785,7 +6762,7 @@ again:
* Chances are we'll be called again, so go ahead and do
* readahead
*/
- path->reada = 1;
+ path->reada = READA_FORWARD;
}
ret = btrfs_lookup_file_extent(trans, root, path,
@@ -6884,8 +6861,7 @@ next:
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, inode, page,
- pg_offset,
+ ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
if (ret) {
err = ret;
@@ -7381,7 +7357,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, cached_state);
+ cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
@@ -7409,25 +7385,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
} else {
- /* Screw you mmap */
- ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
- if (ret)
- break;
- ret = filemap_fdatawait_range(inode->i_mapping,
- lockstart,
- lockend);
- if (ret)
- break;
-
/*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readpages() (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but that did not yet
+ * have a corresponding bio submitted (hence it can not
+ * complete), which makes readpages() wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
*/
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret)
- break;
+ ret = -ENOTBLK;
+ break;
}
cond_resched();
@@ -7483,11 +7455,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em;
}
-struct btrfs_dio_data {
- u64 outstanding_extents;
- u64 reserve;
-};
-
static void adjust_dio_outstanding_extents(struct inode *inode,
struct btrfs_dio_data *dio_data,
const u64 len)
@@ -7671,6 +7638,7 @@ unlock:
btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
current->journal_info = dio_data;
}
@@ -7993,22 +7961,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
bio_put(bio);
}
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+ const u64 offset,
+ const u64 bytes,
+ const int uptodate)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered = NULL;
- u64 ordered_offset = dip->logical_offset;
- u64 ordered_bytes = dip->bytes;
- struct bio *dio_bio;
+ u64 ordered_offset = offset;
+ u64 ordered_bytes = bytes;
int ret;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
ordered_bytes,
- !bio->bi_error);
+ uptodate);
if (!ret)
goto out_test;
@@ -8021,13 +7989,22 @@ out_test:
* our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again
*/
- if (ordered_offset < dip->logical_offset + dip->bytes) {
- ordered_bytes = dip->logical_offset + dip->bytes -
- ordered_offset;
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
ordered = NULL;
goto again;
}
- dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct bio *dio_bio = dip->dio_bio;
+
+ btrfs_endio_direct_write_update_ordered(dip->inode,
+ dip->logical_offset,
+ dip->bytes,
+ !bio->bi_error);
kfree(dip);
@@ -8335,6 +8312,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->subio_endio = btrfs_subio_endio_read;
}
+ /*
+ * Reset the range for unsubmitted ordered extents (to a 0 length range)
+ * even if we fail to submit a bio, because in such case we do the
+ * corresponding error handling below and it must not be done a second
+ * time by btrfs_direct_IO().
+ */
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+ }
+
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
@@ -8363,24 +8355,15 @@ free_ordered:
dip = NULL;
io_bio = NULL;
} else {
- if (write) {
- struct btrfs_ordered_extent *ordered;
-
- ordered = btrfs_lookup_ordered_extent(inode,
- file_offset);
- set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- /*
- * Decrements our ref on the ordered extent and removes
- * the ordered extent from the inode's ordered tree,
- * doing all the proper resource cleanup such as for the
- * reserved space and waking up any waiters for this
- * ordered extent (through btrfs_remove_ordered_extent).
- */
- btrfs_finish_ordered_io(ordered);
- } else {
+ if (write)
+ btrfs_endio_direct_write_update_ordered(inode,
+ file_offset,
+ dio_bio->bi_iter.bi_size,
+ 0);
+ else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- }
+
dio_bio->bi_error = -EIO;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
@@ -8464,7 +8447,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
relock = true;
}
ret = btrfs_delalloc_reserve_space(inode, offset, count);
@@ -8480,6 +8463,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* originally calculated. Abuse current->journal_info for this.
*/
dio_data.reserve = round_up(count, root->sectorsize);
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
@@ -8498,6 +8483,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (dio_data.reserve)
btrfs_delalloc_release_space(inode, offset,
dio_data.reserve);
+ /*
+ * On error we might have left some ordered extents
+ * without submitting corresponding bios for them, so
+ * clean them up to avoid other tasks getting them
+ * and waiting for them to complete forever.
+ */
+ if (dio_data.unsubmitted_oe_range_start <
+ dio_data.unsubmitted_oe_range_end)
+ btrfs_endio_direct_write_update_ordered(inode,
+ dio_data.unsubmitted_oe_range_start,
+ dio_data.unsubmitted_oe_range_end -
+ dio_data.unsubmitted_oe_range_start,
+ 0);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
@@ -8506,7 +8504,7 @@ out:
if (wakeup)
inode_dio_end(inode);
if (relock)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
return ret;
}
@@ -8535,15 +8533,28 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
+ struct inode *inode = page->mapping->host;
+ int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
+ /*
+ * If we are under memory pressure we will call this directly from the
+ * VM, we need to make sure we have the inode referenced for the ordered
+ * extent. If not just return like we didn't do anything.
+ */
+ if (!igrab(inode)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ btrfs_add_delayed_iput(inode);
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -8615,7 +8626,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
/*
@@ -8653,7 +8664,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0,
+ lock_extent_bits(tree, page_start, page_end,
&cached_state);
}
}
@@ -8751,7 +8762,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
/*
@@ -9025,6 +9036,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
+ ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -9049,6 +9061,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
+ INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
@@ -9153,15 +9166,14 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_path_cachep);
if (btrfs_free_space_cachep)
kmem_cache_destroy(btrfs_free_space_cachep);
- if (btrfs_delalloc_work_cachep)
- kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -9189,13 +9201,6 @@ int btrfs_init_cachep(void)
if (!btrfs_free_space_cachep)
goto fail;
- btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
- sizeof(struct btrfs_delalloc_work), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_delalloc_work_cachep)
- goto fail;
-
return 0;
fail:
btrfs_destroy_cachep();
@@ -9419,14 +9424,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
inode = delalloc_work->inode;
- if (delalloc_work->wait) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
if (delalloc_work->delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9436,18 +9437,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
}
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput)
+ int delay_iput)
{
struct btrfs_delalloc_work *work;
- work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ work = kmalloc(sizeof(*work), GFP_NOFS);
if (!work)
return NULL;
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->wait = wait;
work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
@@ -9459,7 +9459,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
wait_for_completion(&work->completion);
- kmem_cache_free(btrfs_delalloc_work_cachep, work);
+ kfree(work);
}
/*
@@ -9495,7 +9495,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode, delay_iput);
if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9637,9 +9637,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
/*
* 2 items for inode item and ref
* 2 items for dir items
+ * 1 item for updating parent inode item
+ * 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -9670,10 +9672,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock_inode;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
- if (err)
- goto out_unlock_inode;
-
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -9711,6 +9709,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
+ /*
+ * Last step, add directory indexes for our symlink inode. This is the
+ * last step to avoid extra cleanup of these indexes if an error happens
+ * elsewhere above.
+ */
+ if (!err)
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err) {
drop_inode = 1;
goto out_unlock_inode;
@@ -9761,7 +9766,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
* If we are severely fragmented we could end up with really
@@ -10025,7 +10030,7 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static struct extent_io_ops btrfs_extent_io_ops = {
+static const struct extent_io_ops btrfs_extent_io_ops = {
.fill_delalloc = run_delalloc_range,
.submit_bio_hook = btrfs_submit_bio_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e21997385d14..952172ca7e45 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ip_oldflags = ip->flags;
i_oldflags = inode->i_flags;
@@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(file);
return ret;
}
@@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
+ mutex_lock(&new_root->objectid_mutex);
+ new_root->highest_objectid = new_dirid;
+ mutex_unlock(&new_root->objectid_mutex);
+
/*
* insert the directory item
*/
@@ -655,22 +659,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+ GFP_NOFS);
+ pending_snapshot->path = btrfs_alloc_path();
+ if (!pending_snapshot->root_item || !pending_snapshot->path) {
+ ret = -ENOMEM;
+ goto free_pending;
+ }
+
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- goto out;
+ goto dec_and_free;
btrfs_wait_ordered_extents(root, -1);
- pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- goto out;
- }
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
@@ -686,7 +696,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto free;
+ goto dec_and_free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -737,11 +747,14 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-free:
- kfree(pending_snapshot);
-out:
+dec_and_free:
if (atomic_dec_and_test(&root->will_be_snapshoted))
wake_up_atomic_t(&root->will_be_snapshoted);
+free_pending:
+ kfree(pending_snapshot->root_item);
+ btrfs_free_path(pending_snapshot->path);
+ kfree(pending_snapshot);
+
return ret;
}
@@ -868,7 +881,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return error;
}
@@ -992,7 +1005,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
u64 end = start + len - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, 0, &cached);
+ lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
@@ -1016,7 +1029,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+ (em->block_len > SZ_128K && next->block_len > SZ_128K))
ret = false;
free_extent_map(next);
@@ -1140,7 +1153,7 @@ again:
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
@@ -1200,7 +1213,7 @@ again:
page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state);
+ page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
@@ -1262,9 +1275,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)128 * 1024 - 1);
+ u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
if (isize == 0)
@@ -1281,7 +1294,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
if (extent_thresh == 0)
- extent_thresh = 256 * 1024;
+ extent_thresh = SZ_256K;
/*
* if we were not given a file, allocate a readahead
@@ -1313,7 +1326,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (newer_than) {
ret = find_new_extents(root, inode, newer_than,
- &newer_off, 64 * 1024);
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
/*
@@ -1380,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index += cluster;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
if (ret < 0) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (newer_than) {
if (newer_off == (u64)-1)
@@ -1403,9 +1416,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
newer_off = max(newer_off + 1,
(u64)i << PAGE_CACHE_SHIFT);
- ret = find_new_extents(root, inode,
- newer_than, &newer_off,
- 64 * 1024);
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
@@ -1453,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
out_ra:
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (!file)
kfree(ra);
@@ -1571,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = old_size + new_size;
}
- if (new_size < 256 * 1024 * 1024) {
+ if (new_size < SZ_256M) {
ret = -EINVAL;
goto out_free;
}
@@ -2160,7 +2172,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
struct inode *inode;
int ret;
size_t buf_size;
- const size_t buf_limit = 16 * 1024 * 1024;
+ const size_t buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2418,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Don't allow to delete a subvolume with send in progress. This is
@@ -2531,7 +2543,7 @@ out_up_write:
spin_unlock(&dest->root_item_lock);
}
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!err) {
d_invalidate(dentry);
btrfs_invalidate_inodes(dest);
@@ -2547,7 +2559,7 @@ out_unlock_inode:
out_dput:
dput(dentry);
out_unlock_dir:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
out_drop_write:
mnt_drop_write_file(file);
out:
@@ -2845,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
- mutex_unlock(&inode1->i_mutex);
- mutex_unlock(&inode2->i_mutex);
+ inode_unlock(inode1);
+ inode_unlock(inode2);
}
static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
@@ -2854,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
if (inode1 < inode2)
swap(inode1, inode2);
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(inode1, I_MUTEX_PARENT);
+ inode_lock_nested(inode2, I_MUTEX_CHILD);
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -3014,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
return 0;
if (same_inode) {
- mutex_lock(&src->i_mutex);
+ inode_lock(src);
ret = extent_same_check_offsets(src, loff, &len, olen);
if (ret)
@@ -3089,14 +3101,14 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
btrfs_cmp_data_free(&cmp);
out_unlock:
if (same_inode)
- mutex_unlock(&src->i_mutex);
+ inode_unlock(src);
else
btrfs_double_inode_unlock(src, dst);
return ret;
}
-#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff)
@@ -3396,7 +3408,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -3737,7 +3749,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
if (!same_inode) {
btrfs_double_inode_lock(src, inode);
} else {
- mutex_lock(&src->i_mutex);
+ inode_lock(src);
}
/* determine range to clone */
@@ -3808,7 +3820,7 @@ out_unlock:
if (!same_inode)
btrfs_double_inode_unlock(src, inode);
else
- mutex_unlock(&src->i_mutex);
+ inode_unlock(src);
return ret;
}
@@ -4039,7 +4051,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
return -ENOMEM;
space_args.total_spaces = 0;
- dest = kmalloc(alloc_size, GFP_NOFS);
+ dest = kmalloc(alloc_size, GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest_orig = dest;
@@ -4416,7 +4428,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- size = min_t(u32, loi->size, 64 * 1024);
+ size = min_t(u32, loi->size, SZ_64K);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -4565,7 +4577,7 @@ locked:
goto out_bargs;
}
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
@@ -4651,7 +4663,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
goto out;
}
- bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
if (!bargs) {
ret = -ENOMEM;
goto out;
@@ -4911,7 +4923,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
if (!qsa)
return -ENOMEM;
@@ -5041,7 +5053,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
goto out;
}
- args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
ret = -ENOMEM;
goto out;
@@ -5178,7 +5190,7 @@ out_unlock:
static int btrfs_ioctl_get_supported_features(struct file *file,
void __user *arg)
{
- static struct btrfs_ioctl_feature_flags features[3] = {
+ static const struct btrfs_ioctl_feature_flags features[3] = {
INIT_FEATURE_FLAGS(SUPP),
INIT_FEATURE_FLAGS(SAFE_SET),
INIT_FEATURE_FLAGS(SAFE_CLEAR)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 8077461fc56a..d13128c70ddd 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
atomic_dec(&eb->spinning_readers);
read_unlock(&eb->lock);
}
- return;
}
/*
@@ -96,7 +95,6 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
}
- return;
}
/*
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1a33d3eb36de..55161369fab1 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
}
spin_unlock_irqrestore(&table->cache_lock, flags);
- return;
}
/*
@@ -610,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
+static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
+ int index)
+{
+ return stripe * rbio->stripe_npages + index;
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
+ int index)
+{
+ return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+}
+
/*
* helper to index into the pstripe
*/
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
- index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
- return rbio->stripe_pages[index];
+ return rbio_stripe_page(rbio, rbio->nr_data, index);
}
/*
@@ -627,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
-
- index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
- PAGE_CACHE_SHIFT;
- return rbio->stripe_pages[index];
+ return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}
/*
@@ -890,6 +901,7 @@ static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
int err = bio->bi_error;
+ int max_errors;
if (err)
fail_bio_stripe(rbio, bio);
@@ -902,11 +914,12 @@ static void raid_write_end_io(struct bio *bio)
err = 0;
/* OK, we have read all the stripes we need to. */
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
+ 0 : rbio->bbio->max_errors;
+ if (atomic_read(&rbio->error) > max_errors)
err = -EIO;
rbio_orig_end_io(rbio, err);
- return;
}
/*
@@ -949,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
*/
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
- unsigned long nr = stripe_len * nr_stripes;
- return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
+ return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;
}
/*
@@ -968,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
void *p;
rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
- DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
- GFP_NOFS);
+ DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
+ sizeof(long), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -1023,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
- ClearPageUptodate(page);
}
return 0;
}
-/* allocate pages for just the p/q stripes */
+/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
int i;
struct page *page;
- i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+ i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
for (; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
@@ -1123,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
}
/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
-{
- int index;
- index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
- index += page;
- return rbio->stripe_pages[index];
-}
-
-/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
* searching through the bio list as we setup the IO in finish_rmw or stripe
@@ -1177,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_bio *bbio = rbio->bbio;
void *pointers[rbio->real_stripes];
- int stripe_len = rbio->stripe_len;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
@@ -1185,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
int q_stripe = -1;
struct bio_list bio_list;
struct bio *bio;
- int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
int ret;
bio_list_init(&bio_list);
@@ -1228,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *p;
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
@@ -1270,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
* everything else.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1294,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
if (!bbio->tgtdev_map[stripe])
continue;
- for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1508,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -1527,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* stripe
*/
for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
/*
* we want to find all the pages missing from
@@ -1803,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int pagenr, stripe;
void **pointers;
int faila = -1, failb = -1;
- int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
struct page *page;
int err;
int i;
@@ -1826,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
@@ -1937,7 +1932,7 @@ pstripe:
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < rbio->stripe_npages; i++) {
if (faila != -1) {
page = rbio_stripe_page(rbio, faila, i);
SetPageUptodate(page);
@@ -2033,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -2057,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
continue;
}
- for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *p;
/*
@@ -2281,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
if (!page)
return -ENOMEM;
rbio->stripe_pages[index] = page;
- ClearPageUptodate(page);
}
}
return 0;
}
-/*
- * end io function used by finish_rmw. When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_parity_end_io(struct bio *bio)
-{
- struct btrfs_raid_bio *rbio = bio->bi_private;
- int err = bio->bi_error;
-
- if (bio->bi_error)
- fail_bio_stripe(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
-
- err = 0;
-
- if (atomic_read(&rbio->error))
- err = -EIO;
-
- rbio_orig_end_io(rbio, err);
-}
-
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
@@ -2464,7 +2432,7 @@ submit_write:
break;
bio->bi_private = rbio;
- bio->bi_end_io = raid_write_parity_end_io;
+ bio->bi_end_io = raid_write_end_io;
submit_bio(WRITE, bio);
}
return;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b4ca5454ef1a..fd1c4d982463 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
- path1->reada = 1;
- path2->reada = 2;
+ path1->reada = READA_FORWARD;
+ path2->reada = READA_FORWARD;
node = alloc_backref_node(cache);
if (!node) {
@@ -2130,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -3030,7 +3030,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
int ret = 0;
BUG_ON(cluster->start != cluster->boundary[0]);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = btrfs_check_data_free_space(inode, cluster->start,
cluster->end + 1 - cluster->start);
@@ -3057,7 +3057,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
btrfs_free_reserved_data_space(inode, cluster->start,
cluster->end + 1 - cluster->start);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -3527,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
@@ -3917,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
ret = prepare_to_relocate(rc);
if (ret) {
@@ -4343,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b091d94ceef6..92bf5ee732fb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1514,8 +1514,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
-
- return;
}
static inline int scrub_check_fsid(u8 fsid[],
@@ -2815,7 +2813,7 @@ out:
static inline int scrub_calc_parity_bitmap_len(int nsectors)
{
- return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+ return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
}
static void scrub_parity_get(struct scrub_parity *sparity)
@@ -3460,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (em->start != chunk_offset)
goto out;
@@ -3507,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3735,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
1, 4);
else
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ btrfs_alloc_workqueue("scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
fs_info->scrub_nocow_workers =
- btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
if (!fs_info->scrub_nocow_workers)
goto fail_scrub_nocow_workers;
fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ btrfs_alloc_workqueue("scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
@@ -4211,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
io_tree = &BTRFS_I(inode)->io_tree;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
@@ -4281,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
return PTR_ERR(inode);
/* Avoid truncate/dio/punch hole.. */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode_dio_wait(inode);
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
@@ -4360,7 +4358,7 @@ next_page:
}
ret = COPY_COMPLETE;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 355a458cba1a..63a6152be04b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1469,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret);
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between or the subvol was snapshotted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 48d425aef05b..02e00166c4da 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -22,8 +22,8 @@
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
#define BTRFS_SEND_STREAM_VERSION 1
-#define BTRFS_SEND_BUF_SIZE (1024 * 64)
-#define BTRFS_SEND_READ_SIZE (1024 * 48)
+#define BTRFS_SEND_BUF_SIZE SZ_64K
+#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a0434c179ea9..d41e09fe8e38 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -295,10 +295,11 @@ enum {
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
- Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
- Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+ Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+ Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+ Opt_skip_balance, Opt_check_integrity,
+ Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
@@ -309,7 +310,7 @@ enum {
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
{Opt_subvolid, "subvolid=%s"},
@@ -340,6 +341,7 @@ static match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
+ {Opt_space_cache_version, "space_cache=%s"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
@@ -381,9 +383,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
int ret = 0;
char *compress_type;
bool compress_force = false;
+ enum btrfs_compression_type saved_compress_type;
+ bool saved_compress_force;
+ int no_compress = 0;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (cache_gen)
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+ else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
if (!options)
@@ -458,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
+ saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+ info->compress_type : BTRFS_COMPRESS_NONE;
+ saved_compress_force =
+ btrfs_test_opt(root, FORCE_COMPRESS);
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -466,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
+ no_compress = 0;
} else if (strcmp(args[0].from, "lzo") == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
@@ -473,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
+ no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
+ no_compress++;
} else {
ret = -EINVAL;
goto out;
}
if (compress_force) {
- btrfs_set_and_info(root, FORCE_COMPRESS,
- "force %s compression",
- compress_type);
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
} else {
- if (!btrfs_test_opt(root, COMPRESS))
- btrfs_info(root->fs_info,
- "btrfs: use %s compression",
- compress_type);
/*
* If we remount from compress-force=xxx to
* compress=xxx, we need clear FORCE_COMPRESS
@@ -500,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
+ if ((btrfs_test_opt(root, COMPRESS) &&
+ (info->compress_type != saved_compress_type ||
+ compress_force != saved_compress_force)) ||
+ (!btrfs_test_opt(root, COMPRESS) &&
+ no_compress == 1)) {
+ btrfs_info(root->fs_info,
+ "%s %s compression",
+ (compress_force) ? "force" : "use",
+ compress_type);
+ }
+ compress_force = false;
break;
case Opt_ssd:
btrfs_set_and_info(root, SSD,
@@ -617,15 +636,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"turning off discard");
break;
case Opt_space_cache:
- btrfs_set_and_info(root, SPACE_CACHE,
- "enabling disk space caching");
+ case Opt_space_cache_version:
+ if (token == Opt_space_cache ||
+ strcmp(args[0].from, "v1") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ FREE_SPACE_TREE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ } else if (strcmp(args[0].from, "v2") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ SPACE_CACHE);
+ btrfs_set_and_info(root, FREE_SPACE_TREE,
+ "enabling free space tree");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- btrfs_clear_and_info(root, SPACE_CACHE,
- "disabling disk space caching");
+ if (btrfs_test_opt(root, SPACE_CACHE)) {
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ }
+ if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ "disabling free space tree");
+ }
break;
case Opt_inode_cache:
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
@@ -754,8 +793,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, CLEAR_CACHE)) {
+ btrfs_err(root->fs_info, "cannot disable free space tree");
+ ret = -EINVAL;
+
+ }
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
+ if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
}
@@ -1162,6 +1210,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, RESCAN_UUID_TREE))
@@ -1863,7 +1913,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
- skip_space = 1024 * 1024;
+ skip_space = SZ_1M;
/* user can set the offset in fs_info->alloc_start. */
if (fs_info->alloc_start &&
@@ -1954,6 +2004,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* there are other factors that may change the result (like a new metadata
* chunk).
*
+ * If metadata is exhausted, f_bavail will be 0.
+ *
* FIXME: not accurate for mixed block groups, total and free/used are ok,
* available appears slightly larger.
*/
@@ -1965,11 +2017,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
+ u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
+ u64 thresh = 0;
/*
* holding chunk_muext to avoid allocating new chunks, holding
@@ -1995,6 +2049,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ total_free_meta += found->disk_total - found->disk_used;
total_used += found->disk_used;
}
@@ -2017,6 +2073,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
+ /*
+ * We calculate the remaining metadata space minus global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this
+ * does not hold in practice, the exhausted state happens when there's still
+ * some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various
+ * operations, so some operations that consume little metadata will
+ * succeed even if the Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = 4 * 1024 * 1024;
+
+ if (total_free_meta - thresh < block_rsv->size)
+ buf->f_bavail = 0;
+
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
@@ -2223,6 +2297,9 @@ static int btrfs_run_sanity_tests(void)
if (ret)
goto out;
ret = btrfs_test_qgroups();
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree();
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9626252ee6b4..b1d920b30070 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../free-space-cache.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../qgroup.h"
@@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+ extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
return fs_info;
}
@@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
kfree(root);
}
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length)
+{ /* Build a dummy block group with its own free space ctl and dummy fs_info; returns NULL if any allocation fails. */
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache); /* unwind the first allocation */
+ return NULL;
+ }
+ cache->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!cache->fs_info) {
+ kfree(cache->free_space_ctl); /* unwind both earlier allocations */
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = 0;
+ cache->key.offset = length; /* the caller-supplied length is stored as the key offset */
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
+
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ btrfs_init_free_space_ctl(cache); /* runs after key and sectorsize are filled in; NOTE(review): presumably it reads them - confirm */
+ mutex_init(&cache->free_space_lock);
+
+ return cache;
+}
+
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+{ /* Drop the cached free space entries, then free the ctl and the block group itself. */
+ if (!cache)
+ return; /* a NULL cache is accepted and ignored */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ kfree(cache->free_space_ctl);
+ kfree(cache); /* NOTE(review): cache->fs_info is not released here - confirm the caller frees it */
+}
+
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+{ /* Reset a caller-provided handle to a minimal dummy transaction. */
+ memset(trans, 0, sizeof(*trans)); /* every field not set below ends up zero */
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index fd3954224480..054b8c73c951 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,17 +24,23 @@
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
struct btrfs_root;
+struct btrfs_trans_handle;
int btrfs_test_free_space_cache(void);
int btrfs_test_extent_buffer_operations(void);
int btrfs_test_extent_io(void);
int btrfs_test_inodes(void);
int btrfs_test_qgroups(void);
+int btrfs_test_free_space_tree(void);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
void btrfs_free_dummy_root(struct btrfs_root *root);
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length);
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
static inline int btrfs_test_free_space_cache(void)
{
@@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void)
{
return 0;
}
+static inline int btrfs_test_free_space_tree(void)
+{ /* stub in the #else branch of this header: runs nothing */
+ return 0; /* always reports success */
+}
#endif
#endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e9f2368177d..e29fa297e053 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
#include "btrfs-tests.h"
#include "../extent_io.h"
@@ -70,12 +72,14 @@ static int test_find_delalloc(void)
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
- u64 total_dirty = 256 * 1024 * 1024;
- u64 max_bytes = 128 * 1024 * 1024;
+ u64 total_dirty = SZ_256M;
+ u64 max_bytes = SZ_128M;
u64 start, end, test_start;
u64 found;
int ret = -EINVAL;
+ test_msg("Running find delalloc tests\n");
+
inode = btrfs_new_test_inode();
if (!inode) {
test_msg("Failed to allocate test inode\n");
@@ -133,7 +137,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- test_start = 64 * 1024 * 1024;
+ test_start = SZ_64M;
locked_page = find_lock_page(inode->i_mapping,
test_start >> PAGE_CACHE_SHIFT);
if (!locked_page) {
@@ -220,8 +224,8 @@ static int test_find_delalloc(void)
* Now to test where we run into a page that is no longer dirty in the
* range we want to find.
*/
- page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
- >> PAGE_CACHE_SHIFT);
+ page = find_get_page(inode->i_mapping,
+ (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT);
if (!page) {
test_msg("Couldn't find our page\n");
goto out_bits;
@@ -268,8 +272,139 @@ out:
return ret;
}
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
+ unsigned long len)
+{ /* Apply the same bit operations to a plain bitmap and to eb; returns 0 if they always agree, -EINVAL on the first mismatch. */
+ unsigned long i, x;
+
+ memset(bitmap, 0, len); /* len is a byte count; bit counts below use len * BITS_PER_BYTE */
+ memset_extent_buffer(eb, 0, 0, len);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Bitmap was not zeroed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE); /* set every bit in both copies */
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); /* clear every bit in both copies */
+ extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, /* window starts half a long before byte PAGE_CACHE_SIZE */
+ sizeof(long) * BITS_PER_BYTE); /* one long wide, so it crosses byte offset PAGE_CACHE_SIZE */
+ extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE); /* set everything, then clear the same straddling window */
+ bitmap_clear(bitmap,
+ (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a wonky pseudo-random bit pattern for the sake of not using
+ * something repetitive that could miss some hypothetical off-by-n bug.
+ */
+ x = 0;
+ for (i = 0; i < len / sizeof(long); i++) {
+ x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; /* multiply-add step, masked to the low 32 bits */
+ bitmap[i] = x;
+ }
+ write_extent_buffer(eb, bitmap, 0, len); /* copy the generated pattern into eb */
+
+ for (i = 0; i < len * BITS_PER_BYTE; i++) {
+ int bit, bit1;
+
+ bit = !!test_bit(i, bitmap); /* expected value, normalised to 0 or 1 */
+ bit1 = !!extent_buffer_test_bit(eb, 0, i); /* bit i counted from byte offset 0 */
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern failed\n");
+ return -EINVAL;
+ }
+
+ bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, /* same bit, addressed as byte offset plus bit within byte */
+ i % BITS_PER_BYTE);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern with offset failed\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int test_eb_bitmaps(void)
+{ /* Run __test_eb_bitmaps() twice: on a dummy extent buffer created at 0, then on one created at PAGE_CACHE_SIZE / 2. */
+ unsigned long len = PAGE_CACHE_SIZE * 4; /* size in bytes of both the bitmap and the extent buffer */
+ unsigned long *bitmap;
+ struct extent_buffer *eb;
+ int ret;
+
+ test_msg("Running extent buffer bitmap tests\n");
+
+ bitmap = kmalloc(len, GFP_NOFS);
+ if (!bitmap) {
+ test_msg("Couldn't allocate test bitmap\n");
+ return -ENOMEM;
+ }
+
+ eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap); /* nothing else has been allocated yet */
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+ if (ret)
+ goto out; /* first pass failed: skip the second pass */
+
+ /* Do it over again with an extent buffer which isn't page-aligned. */
+ free_extent_buffer(eb);
+ eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap); /* the first eb was already released above, so only the bitmap remains */
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+out:
+ free_extent_buffer(eb); /* releases whichever eb is current */
+ kfree(bitmap);
+ return ret;
+}
+
int btrfs_test_extent_io(void)
{
- test_msg("Running find delalloc tests\n");
- return test_find_delalloc();
+ int ret; /* result of the last sub-test run; 0 means success */
+
+ test_msg("Running extent I/O tests\n");
+
+ ret = test_find_delalloc();
+ if (ret)
+ goto out; /* skip the bitmap tests once the delalloc test has failed */
+
+ ret = test_eb_bitmaps();
+out:
+ test_msg("Extent I/O tests finished\n"); /* logged on success and on failure */
+ return ret;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 8b72b005bfb9..c9ad97b1e690 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -23,41 +23,6 @@
#include "../free-space-cache.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-static struct btrfs_block_group_cache *init_test_block_group(void)
-{
- struct btrfs_block_group_cache *cache;
-
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
- if (!cache)
- return NULL;
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- return NULL;
- }
- cache->fs_info = btrfs_alloc_dummy_fs_info();
- if (!cache->fs_info) {
- kfree(cache->free_space_ctl);
- kfree(cache);
- return NULL;
- }
-
- cache->key.objectid = 0;
- cache->key.offset = 1024 * 1024 * 1024;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = 4096;
- cache->full_stripe_len = 4096;
-
- spin_lock_init(&cache->lock);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->bg_list);
-
- btrfs_init_free_space_ctl(cache);
-
- return cache;
-}
/*
* This test just does basic sanity checking, making sure we can add an extent
@@ -71,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache)
test_msg("Running extent only tests\n");
/* First just make sure we can remove an entire entry */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding initial extents %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing extent %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Full remove left some lingering space\n");
return -1;
}
/* Ok edge and middle cases now */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding half extent %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
if (ret) {
test_msg("Error removing tail end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Error removing front end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
if (ret) {
test_msg("Error removing middle piece %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Still have space at the front\n");
return -1;
}
- if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+ if (test_check_exists(cache, SZ_2M, 4096)) {
test_msg("Still have space in the middle\n");
return -1;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
test_msg("Still have space at the end\n");
return -1;
}
@@ -141,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
test_msg("Running bitmap only tests\n");
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't create a bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing bitmap full range %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Left some space in bitmap\n");
return -1;
}
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to our bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove middle chunk %d\n", ret);
return ret;
@@ -177,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
/* Test a bit straddling two bitmaps */
- ret = test_add_free_space_entry(cache, next_bitmap_offset -
- (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
+ SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space that straddles two bitmaps %d\n",
ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, next_bitmap_offset -
- (1 * 1024 * 1024), 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
- 2 * 1024 * 1024)) {
+ if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
test_msg("Left some space when removing overlapping\n");
return -1;
}
@@ -216,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* bitmap, but the free space completely in the extent and then
* completely in the bitmap.
*/
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
if (ret) {
test_msg("Couldn't create bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Couldn't remove extent entry %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Left remnants after our remove\n");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't re-add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
if (ret) {
test_msg("Couldn't remove from bitmap %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_4M, SZ_1M)) {
test_msg("Left remnants in the bitmap\n");
return -1;
}
@@ -261,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* Ok so a little more evil, extent entry and bitmap at the same offset,
* removing an overlapping chunk.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to a bitmap %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
test_msg("Left over pieces after removing overlapping\n");
return -1;
}
@@ -281,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
__btrfs_remove_free_space_cache(cache->free_space_ctl);
/* Now with the extent entry offset into the bitmap */
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space to the bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
if (ret) {
test_msg("Couldn't add extent to the cache %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
if (ret) {
test_msg("Problem removing overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
test_msg("Left something behind when removing space");
return -1;
}
@@ -315,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* [ del ]
*/
__btrfs_remove_free_space_cache(cache->free_space_ctl);
- ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
- 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
- 5 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
+ 5 * SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
if (ret) {
test_msg("Failed to free our space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024)) {
+ if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
test_msg("Left stuff over\n");
return -1;
}
@@ -350,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* to return -EAGAIN back from btrfs_remove_extent, make sure this
* doesn't happen.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
if (ret) {
test_msg("Error removing bitmap and extent overlapping %d\n", ret);
return ret;
@@ -445,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int ret;
u64 offset;
u64 max_extent_size;
-
- bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
- struct btrfs_free_space *);
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
+ .use_bitmap = test_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
test_msg("Running space stealing from bitmap to extent\n");
@@ -469,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that forces use of bitmaps as soon as we have at least 1
* extent entry.
*/
- use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
- cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
/*
* Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
+ SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -502,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 512Kb, 128Mb + 768Kb[
*/
ret = btrfs_remove_free_space(cache,
- 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024);
+ SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -525,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -535,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -545,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
* by the bitmap too, isn't marked as free either.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -556,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* lets make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -581,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
- 4096);
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -601,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb - 256Kb, 128Mb - 128Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -637,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
test_msg("Cache free space is not 1Mb + 4Kb\n");
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ if (offset != (SZ_128M - SZ_256K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -670,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 4096, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ if (offset != (SZ_128M + SZ_16M)) {
test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -691,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
/*
* Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
- ret = test_add_free_space_entry(cache, 0,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -717,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 128Kb, 128Mb + 256Kb[
* [128Mb - 768Kb, 128Mb - 512Kb[
*/
- ret = btrfs_remove_free_space(cache,
- 0,
- 128 * 1024 * 1024 - 768 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -741,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 0,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -751,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb - 512Kb, 128Mb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -762,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* lets make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -789,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ ret = btrfs_add_free_space(cache, SZ_32M, 8192);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -800,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb + 128Kb, 128Mb + 256Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -834,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
test_msg("Cache free space is not 1Mb + 8Kb\n");
return -EINVAL;
}
- offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ if (offset != (SZ_128M - 768 * SZ_1K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -867,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 8192, 0,
&max_extent_size);
- if (offset != (32 * 1024 * 1024)) {
+ if (offset != SZ_32M) {
test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -877,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
if (ret)
return ret;
- cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ cache->free_space_ctl->op = orig_free_space_ops;
__btrfs_remove_free_space_cache(cache->free_space_ctl);
return 0;
@@ -891,7 +832,7 @@ int btrfs_test_free_space_cache(void)
test_msg("Running btrfs free space cache tests\n");
- cache = init_test_block_group();
+ cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
if (!cache) {
test_msg("Couldn't run the tests\n");
return 0;
@@ -922,9 +863,7 @@ int btrfs_test_free_space_cache(void)
ret = test_steal_space_from_bitmap_to_extent(cache);
out:
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
- kfree(cache->free_space_ctl);
- kfree(cache);
+ btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
test_msg("Free space cache tests finished\n");
return ret;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
new file mode 100644
index 000000000000..d05fe1ab4808
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+
+/*
+ * An expected free space extent, described by its start offset and its
+ * length.  The checkers compare these against what the tree contains.
+ */
+struct free_space_extent {
+	u64 start;
+	u64 length;
+};
+
+/*
+ * The test cases align their operations to this in order to hit some of the
+ * edge cases in the bitmap code.
+ *
+ * NOTE(review): presumably this is the range covered by one free space
+ * bitmap item when every bit stands for 4096 bytes - confirm that the
+ * hard-coded 4096 matches the sector size of the dummy block group.
+ */
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+
+/*
+ * Verify that the free space items stored for @cache describe exactly the
+ * ranges listed in @extents, in the same order.
+ *
+ * The block group's free space info item is looked up first; its extent
+ * count must equal @num_extents.  Depending on the flags stored in that
+ * item, the items following it in the same leaf are then decoded either as
+ * bitmaps or as extent items and compared against @extents.
+ *
+ * Returns 0 if everything matches, -EINVAL on any mismatch, or the error
+ * returned by search_free_space_info().  @path is released before returning.
+ */
+static int __check_free_space_extents(struct btrfs_trans_handle *trans,
+				      struct btrfs_fs_info *fs_info,
+				      struct btrfs_block_group_cache *cache,
+				      struct btrfs_path *path,
+				      struct free_space_extent *extents,
+				      unsigned int num_extents)
+{
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key;
+	int prev_bit = 0, bit;
+	u64 extent_start = 0, offset, end;
+	u32 flags, extent_count;
+	unsigned int i;
+	int ret;
+
+	info = search_free_space_info(trans, fs_info, cache, path, 0);
+	if (IS_ERR(info)) {
+		test_msg("Could not find free space info\n");
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+	if (extent_count != num_extents) {
+		test_msg("Extent count is wrong\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+		/* The info item has to sit in slot 0, bitmaps come after it. */
+		if (path->slots[0] != 0)
+			goto invalid;
+		end = cache->key.objectid + cache->key.offset;
+		i = 0;
+		while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY)
+				goto invalid;
+			offset = key.objectid;
+			/*
+			 * prev_bit carries over from the previous bitmap item,
+			 * so a run of set bits can span several items.
+			 */
+			while (offset < key.objectid + key.offset) {
+				bit = free_space_test_bit(cache, path, offset);
+				if (prev_bit == 0 && bit == 1) {
+					/* 0 -> 1: a run of set bits starts. */
+					extent_start = offset;
+				} else if (prev_bit == 1 && bit == 0) {
+					/*
+					 * 1 -> 0: the run ended, it must be
+					 * the next expected extent.
+					 */
+					if (i >= num_extents ||
+					    extent_start != extents[i].start ||
+					    offset - extent_start != extents[i].length)
+						goto invalid;
+					i++;
+				}
+				prev_bit = bit;
+				offset += cache->sectorsize;
+			}
+		}
+		/* A run still open after the last item ends at the group end. */
+		if (prev_bit == 1) {
+			if (i >= num_extents ||
+			    extent_start != extents[i].start ||
+			    end - extent_start != extents[i].length)
+				goto invalid;
+			i++;
+		}
+		if (i != num_extents)
+			goto invalid;
+	} else {
+		/* Extent format: the info item plus one item per extent. */
+		if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 ||
+		    path->slots[0] != 0)
+			goto invalid;
+		for (i = 0; i < num_extents; i++) {
+			path->slots[0]++;
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY ||
+			    key.objectid != extents[i].start ||
+			    key.offset != extents[i].length)
+				goto invalid;
+		}
+	}
+
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	return ret;
+invalid:
+	test_msg("Free space tree is invalid\n");
+	ret = -EINVAL;
+	goto out;
+}
+
+/*
+ * Check the free space of @cache against @extents in the format it currently
+ * uses, then convert it to the other format (bitmaps <-> extents) and check
+ * it once more.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int check_free_space_extents(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *cache,
+				    struct btrfs_path *path,
+				    struct free_space_extent *extents,
+				    unsigned int num_extents)
+{
+	struct btrfs_free_space_info *fsi;
+	int using_bitmaps;
+	int err;
+
+	fsi = search_free_space_info(trans, fs_info, cache, path, 0);
+	if (IS_ERR(fsi)) {
+		test_msg("Could not find free space info\n");
+		btrfs_release_path(path);
+		return PTR_ERR(fsi);
+	}
+	/* Remember the current format before the path is released. */
+	using_bitmaps = !!(btrfs_free_space_flags(path->nodes[0], fsi) &
+			   BTRFS_FREE_SPACE_USING_BITMAPS);
+	btrfs_release_path(path);
+
+	err = __check_free_space_extents(trans, fs_info, cache, path, extents,
+					 num_extents);
+	if (err)
+		return err;
+
+	/* Flip it to the other format and check that for good measure. */
+	if (using_bitmaps) {
+		err = convert_free_space_to_extents(trans, fs_info, cache, path);
+		if (err)
+			test_msg("Could not convert to extents\n");
+	} else {
+		err = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+		if (err)
+			test_msg("Could not convert to bitmaps\n");
+	}
+	if (err)
+		return err;
+
+	return __check_free_space_extents(trans, fs_info, cache, path, extents,
+					  num_extents);
+}
+
+/*
+ * Without any modification the whole block group is expected to show up as
+ * one single free extent.
+ */
+static int test_empty_block_group(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *cache,
+				  struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{ .start = cache->key.objectid, .length = cache->key.offset },
+	};
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the free space of the entire block group and expect that no free
+ * extent is left.
+ */
+static int test_remove_all(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	/* Nothing may remain, hence the empty list. */
+	struct free_space_extent expected[] = {};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid,
+					    cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the first BITMAP_RANGE of the block group; the remainder must stay
+ * behind as one free extent.
+ */
+static int test_remove_beginning(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *cache,
+				 struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{
+			.start = cache->key.objectid + BITMAP_RANGE,
+			.length = cache->key.offset - BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid, BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the last BITMAP_RANGE of the block group; everything in front of it
+ * must stay behind as one free extent.
+ */
+static int test_remove_end(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{
+			.start = cache->key.objectid,
+			.length = cache->key.offset - BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid +
+					    cache->key.offset - BITMAP_RANGE,
+					    BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the second BITMAP_RANGE of the block group; the free space must be
+ * split into one extent in front of the hole and one behind it.
+ */
+static int test_remove_middle(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_group_cache *cache,
+			      struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{
+			.start = cache->key.objectid,
+			.length = BITMAP_RANGE,
+		},
+		{
+			.start = cache->key.objectid + 2 * BITMAP_RANGE,
+			.length = cache->key.offset - 2 * BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid + BITMAP_RANGE,
+					    BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, add the first BITMAP_RANGE and then the range
+ * directly behind it.  The second range has a free neighbour on its left
+ * only, and one extent of 2 * BITMAP_RANGE is expected as the result.
+ */
+static int test_merge_left(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{ .start = cache->key.objectid, .length = 2 * BITMAP_RANGE },
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid,
+					    cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	/* Left neighbour first ... */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid, BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	/* ... then the range adjacent to its end. */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, add the third BITMAP_RANGE and then the range
+ * directly in front of it.  The second range has a free neighbour on its
+ * right only, and one extent of 2 * BITMAP_RANGE is expected as the result.
+ */
+static int test_merge_right(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_group_cache *cache,
+			    struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{
+			.start = cache->key.objectid + BITMAP_RANGE,
+			.length = 2 * BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid,
+					    cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	/* Right neighbour first ... */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + 2 * BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	/* ... then the range ending where that neighbour starts. */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, add the first and the third BITMAP_RANGE and then
+ * fill the gap between them.  The last range has free neighbours on both
+ * sides, and one extent of 3 * BITMAP_RANGE is expected as the result.
+ */
+static int test_merge_both(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{ .start = cache->key.objectid, .length = 3 * BITMAP_RANGE },
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid,
+					    cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	/* Left neighbour. */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid, BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	/* Right neighbour. */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + 2 * BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	/* The range in between, touching both of them. */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, then add the first, the fifth and finally the third
+ * BITMAP_RANGE.  None of them touches another one, so three separate extents
+ * are expected.
+ */
+static int test_merge_none(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{
+			.start = cache->key.objectid,
+			.length = BITMAP_RANGE,
+		},
+		{
+			.start = cache->key.objectid + 2 * BITMAP_RANGE,
+			.length = BITMAP_RANGE,
+		},
+		{
+			.start = cache->key.objectid + 4 * BITMAP_RANGE,
+			.length = BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid,
+					    cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid, BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + 4 * BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	/* Lands between the two others with a gap on either side. */
+	err = __add_to_free_space_tree(trans, fs_info, cache, path,
+				       cache->key.objectid + 2 * BITMAP_RANGE,
+				       BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not add free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/* Common signature of the test cases that run_test() invokes. */
+typedef int (*test_func_t)(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path);
+
+/*
+ * Run a single test case against a newly built dummy free space tree.
+ *
+ * Allocates a dummy root and fs_info with an empty leaf as tree node and a
+ * dummy block group of 8 * BITMAP_RANGE, adds the block group's free space,
+ * converts it to bitmaps when @bitmaps is non-zero and calls @test_func.
+ * Afterwards the block group's free space is removed again and the leaf is
+ * required to contain no items.
+ *
+ * Returns 0 on success or an error code.  Everything allocated here is
+ * released on all paths.
+ */
+static int run_test(test_func_t test_func, int bitmaps)
+{
+	struct btrfs_root *root = NULL;
+	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_trans_handle trans;
+	struct btrfs_path *path = NULL;
+	int ret;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate dummy root\n");
+		/*
+		 * Nothing else has been allocated yet, so return directly
+		 * instead of handing an error pointer to the cleanup below.
+		 */
+		return PTR_ERR(root);
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+					BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
+	/* The one dummy root serves as both free space root and tree root. */
+	root->fs_info->free_space_root = root;
+	root->fs_info->tree_root = root;
+
+	root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* Start out with an empty leaf. */
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_nritems(root->node, 0);
+	root->alloc_bytenr += 8192;
+
+	cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+	if (!cache) {
+		test_msg("Couldn't allocate dummy block group cache\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	/*
+	 * NOTE(review): these thresholds presumably keep the free space tree
+	 * code from switching formats on its own, so that only the explicit
+	 * conversions in the tests do - confirm in free-space-tree.c.
+	 */
+	cache->bitmap_low_thresh = 0;
+	cache->bitmap_high_thresh = (u32)-1;
+	cache->needs_free_space = 1;
+
+	btrfs_init_dummy_trans(&trans);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		/* Go through the common exit so root and cache are freed. */
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = add_block_group_free_space(&trans, root->fs_info, cache);
+	if (ret) {
+		test_msg("Could not add block group free space\n");
+		goto out;
+	}
+
+	if (bitmaps) {
+		ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
+						    cache, path);
+		if (ret) {
+			test_msg("Could not convert block group to bitmaps\n");
+			goto out;
+		}
+	}
+
+	ret = test_func(&trans, root->fs_info, cache, path);
+	if (ret)
+		goto out;
+
+	ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+	if (ret) {
+		test_msg("Could not remove block group free space\n");
+		goto out;
+	}
+
+	/* Removing the block group must not leave any item behind. */
+	if (btrfs_header_nritems(root->node) != 0) {
+		test_msg("Free space tree has leftover items\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	btrfs_free_dummy_block_group(cache);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
+/*
+ * Run @test_func without the initial conversion to bitmaps first and, if
+ * that passes, once more with it.  Returns the first non-zero result.
+ */
+static int run_test_both_formats(test_func_t test_func)
+{
+	int err = run_test(test_func, 0);
+
+	if (!err)
+		err = run_test(test_func, 1);
+	return err;
+}
+
+/*
+ * Entry point of the free space tree tests: runs every test case in both
+ * formats and stops at the first failure, returning its error code.
+ * Returns 0 when all test cases pass.
+ */
+int btrfs_test_free_space_tree(void)
+{
+	test_func_t cases[] = {
+		test_empty_block_group,
+		test_remove_all,
+		test_remove_beginning,
+		test_remove_end,
+		test_remove_middle,
+		test_merge_left,
+		test_merge_right,
+		test_merge_both,
+		test_merge_none,
+	};
+	int idx;
+	int err;
+
+	test_msg("Running free space tree tests\n");
+	for (idx = 0; idx < ARRAY_SIZE(cases); idx++) {
+		err = run_test_both_formats(cases[idx]);
+		if (!err)
+			continue;
+		/* Report which case failed and give up. */
+		test_msg("%pf failed\n", cases[idx]);
+		return err;
+	}
+
+	return 0;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 054fc0d97131..5de55fdd28bc 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
static void setup_file_extents(struct btrfs_root *root)
{
int slot = 0;
- u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 disk_bytenr = SZ_1M;
u64 offset = 0;
/* First we want a hole */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 846d277b1901..8ea5d34bc5a2 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -23,14 +23,6 @@
#include "../qgroup.h"
#include "../backref.h"
-static void init_dummy_trans(struct btrfs_trans_handle *trans)
-{
- memset(trans, 0, sizeof(*trans));
- trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
- trans->type = __TRANS_DUMMY;
-}
-
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
{
@@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
ins.objectid = bytenr;
ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
struct btrfs_path *path;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
ret = btrfs_create_qgroup(NULL, fs_info, 5);
@@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup multiple refs test\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index be8eae80ff65..b6031ce474f7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
list_del_init(&em->list);
free_extent_map(em);
}
+ /*
+ * If any block groups are found in ->deleted_bgs then it's
+ * because the transaction was aborted and a commit did not
+ * happen (things failed before writing the new superblock
+ * and calling btrfs_finish_extent_commit()), so we can not
+ * discard the physical locations of the block groups.
+ */
+ while (!list_empty(&transaction->deleted_bgs)) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = list_first_entry(&transaction->deleted_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&cache->bg_list);
+ btrfs_put_block_group_trimming(cache);
+ btrfs_put_block_group(cache);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -634,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN, 0);
+ return start_transaction(root, 0, TRANS_JOIN,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_USERSPACE, 0);
+ return start_transaction(root, 0, TRANS_USERSPACE,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -662,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
*/
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_ATTACH, 0);
+ return start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -677,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ trans = start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
btrfs_wait_for_commit(root, 0);
@@ -1319,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 root_flags;
uuid_le new_uuid;
- path = btrfs_alloc_path();
- if (!path) {
- pending->error = -ENOMEM;
- return 0;
- }
+ ASSERT(pending->path);
+ path = pending->path;
- new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
- if (!new_root_item) {
- pending->error = -ENOMEM;
- goto root_item_alloc_fail;
- }
+ ASSERT(pending->root_item);
+ new_root_item = pending->root_item;
pending->error = btrfs_find_free_objectid(tree_root, &objectid);
if (pending->error)
@@ -1562,8 +1578,10 @@ clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
-root_item_alloc_fail:
+ pending->root_item = NULL;
btrfs_free_path(path);
+ pending->path = NULL;
+
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 64c8221b6165..72be51f7ca2f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -137,8 +137,10 @@ struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
struct btrfs_root *root;
+ struct btrfs_root_item *root_item;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
+ struct btrfs_path *path;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f31db4325339..cb65089127cc 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
+ * later deadlock (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0;
goto out;
}
- path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- min_trans);
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
&last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
WARN_ON(ret == -EAGAIN);
goto out;
}
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ min_trans);
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a23399e8e3ab..366b335946fa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
-const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
[BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
[BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
@@ -125,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static void btrfs_close_one_device(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -232,6 +233,7 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
@@ -1102,7 +1104,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = device->devid;
key.offset = start;
@@ -1182,7 +1184,7 @@ again:
struct map_lookup *map;
int i;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
u64 end;
@@ -1257,6 +1259,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
int ret;
int slot;
struct extent_buffer *l;
+ u64 min_search_start;
+
+ /*
+ * We don't want to overwrite the superblock on the drive nor any area
+ * used by the boot loader (grub for example), so we make sure to start
+ * at an offset of at least 1MB.
+ */
+ min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ search_start = max(search_start, min_search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1271,7 +1282,7 @@ again:
goto out;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -1397,18 +1408,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
- struct btrfs_root *root = device->dev_root;
- u64 search_start;
-
/* FIXME use last free of some kind */
-
- /*
- * we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
- */
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
return find_free_dev_extent_start(trans->transaction, device,
- num_bytes, search_start, start, len);
+ num_bytes, 0, start, len);
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1642,7 +1644,6 @@ static void update_dev_time(char *path_name)
return;
file_update_time(filp);
filp_close(filp, NULL);
- return;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
@@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
free_extent_map(em);
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
lock_chunks(root->fs_info->chunk_root);
check_system_chunk(trans, extent_root, map->type);
unlock_chunks(root->fs_info->chunk_root);
@@ -3406,7 +3407,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, devices, dev_list) {
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
- size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ size_to_free = min_t(u64, size_to_free, SZ_1M);
if (!device->writeable ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
@@ -3723,14 +3724,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
goto out;
}
- /* allow dup'ed data chunks only in mixed mode */
- if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
- btrfs_err(fs_info, "dup for data is not allowed");
- ret = -EINVAL;
- goto out;
- }
-
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
@@ -3756,6 +3749,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
+ if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ btrfs_warn(fs_info,
+ "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
+ bctl->meta.target, bctl->data.target);
+ }
+
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures = min(
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
@@ -4268,7 +4268,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
lock_chunks(root);
@@ -4460,7 +4460,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
- return 64 * 1024;
+ return SZ_64K;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
@@ -4528,21 +4528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = 1024 * 1024 * 1024;
+ max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
- max_stripe_size = 1024 * 1024 * 1024;
+ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ max_stripe_size = SZ_1G;
else
- max_stripe_size = 256 * 1024 * 1024;
+ max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = 32 * 1024 * 1024;
+ max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
@@ -4719,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
goto error;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = start;
em->len = num_bytes;
em->block_start = 0;
@@ -4793,7 +4793,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 dev_offset;
u64 stripe_size;
int i = 0;
- int ret;
+ int ret = 0;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
@@ -4814,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
@@ -4824,20 +4824,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()).
+ */
+ mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out;
+ break;
ret = btrfs_alloc_dev_extent(trans, device,
chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
chunk_offset, dev_offset,
stripe_size);
if (ret)
- goto out;
+ break;
+ }
+ if (ret) {
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ goto out;
}
stripe = &chunk->stripe;
@@ -4850,6 +4862,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -4956,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->missing) {
miss_ndevs++;
@@ -5036,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5072,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
free_extent_map(em);
@@ -5093,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
@@ -5252,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
offset = logical - em->start;
stripe_len = map->stripe_len;
@@ -5366,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* target drive.
*/
for (i = 0; i < tmp_num_stripes; i++) {
- if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it
- * simple, only add the mirror with the
- * lowest physical address
- */
- if (found &&
- physical_of_found <=
- tmp_bbio->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found =
- tmp_bbio->stripes[i].physical;
- }
+ if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add
+ * the mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= tmp_bbio->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = tmp_bbio->stripes[i].physical;
}
- if (found) {
- mirror_num = index_srcdev + 1;
- patch_the_first_stripe_for_dev_replace = 1;
- physical_to_patch_in_first_stripe = physical_of_found;
- } else {
+ btrfs_put_bbio(tmp_bbio);
+
+ if (!found) {
WARN_ON(1);
ret = -EIO;
- btrfs_put_bbio(tmp_bbio);
goto out;
}
- btrfs_put_bbio(tmp_bbio);
+ mirror_num = index_srcdev + 1;
+ patch_the_first_stripe_for_dev_replace = 1;
+ physical_to_patch_in_first_stripe = physical_of_found;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
@@ -5794,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
free_extent_map(em);
return -EIO;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
length = em->len;
rmap_len = map->stripe_len;
@@ -6057,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
- if (bbio->raid_map) {
+ if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ ((rw & WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
@@ -6198,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_map *em;
u64 logical;
u64 length;
+ u64 stripe_len;
u64 devid;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
@@ -6206,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ /* Validation check */
+ if (!num_stripes) {
+ btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+ num_stripes);
+ return -EIO;
+ }
+ if (!IS_ALIGNED(logical, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk logical %llu", logical);
+ return -EIO;
+ }
+ if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk length %llu", length);
+ return -EIO;
+ }
+ if (!is_power_of_2(stripe_len)) {
+ btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+ stripe_len);
+ return -EIO;
+ }
+ if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk)) {
+ btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+ ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk));
+ return -EIO;
+ }
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6222,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
em = alloc_extent_map();
if (!em)
return -ENOMEM;
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
@@ -6230,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = logical;
em->len = length;
em->orig_start = 0;
@@ -6465,11 +6508,11 @@ int btrfs_read_sys_array(struct btrfs_root *root)
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
- btrfs_set_buffer_uptodate(sb);
+ set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artifical and just used to read the system array.
- * btrfs_set_buffer_uptodate() call does not properly mark all it's
+ * set_extent_buffer_uptodate() call does not properly mark all its
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
@@ -6512,6 +6555,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ printk(KERN_ERR
+ "BTRFS: invalid number of stripes %u in sys_array at offset %u\n",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6520,6 +6571,9 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (ret)
break;
} else {
+ printk(KERN_ERR
+ "BTRFS: unexpected item type %u in sys_array at offset %u\n",
+ (u32)key.type, cur_offset);
ret = -EIO;
break;
}
@@ -6921,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
/* In order to kick the device replace finish process */
lock_chunks(root);
list_for_each_entry(em, &transaction->pending_chunks, list) {
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
@@ -6949,7 +7003,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
}
}
-void btrfs_close_one_device(struct btrfs_device *device)
+static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
struct btrfs_device *new_device;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d5c84f6b1353..1939ebde63df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,7 +26,7 @@
extern struct mutex uuid_mutex;
-#define BTRFS_STRIPE_LEN (64 * 1024)
+#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
struct btrfs_pending_bios {
@@ -566,6 +566,5 @@ static inline void unlock_chunks(struct btrfs_root *root)
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_close_one_device(struct btrfs_device *device);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7cbef1a14fe1..6c68d6356197 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
* locks the inode's i_mutex before calling setxattr or removexattr.
*/
if (flags & XATTR_REPLACE) {
- ASSERT(mutex_is_locked(&inode->i_mutex));
+ ASSERT(inode_is_locked(inode));
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
name, name_len, 0);
if (!di)
@@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -446,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_NOFS);
+ strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
break;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index afa023dded5b..675a3332d72f 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
return 0;
cachefiles_begin_secure(cache, &saved_cred);
- mutex_lock(&d_inode(object->backer)->i_mutex);
+ inode_lock(d_inode(object->backer));
/* if there's an extension to a partial page at the end of the backing
* file, we need to discard the partial page so that we pick up new
@@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
ret = notify_change(object->backer, &newattrs, NULL);
truncate_failed:
- mutex_unlock(&d_inode(object->backer)->i_mutex);
+ inode_unlock(d_inode(object->backer));
cachefiles_end_secure(cache, saved_cred);
if (ret == -EIO) {
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index c4b893453e0e..1c2334c163dd 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
cachefiles_mark_object_buried(cache, rep, why);
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
if (ret == -EIO)
cachefiles_io_error(cache, "Unlink failed");
@@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
/* directories have to be moved to the graveyard */
_debug("move stale object to graveyard");
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
try_again:
/* first step is to make up a grave dentry in the graveyard */
@@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
dir = dget_parent(object->dentry);
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
/* object allocation for the same key preemptively deleted this
* object's file so that it could create its own file */
_debug("object preemptively buried");
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = 0;
} else {
/* we need to check that our parent is _still_ our parent - it
@@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
/* it got moved, presumably by cachefilesd culling it,
* so it's no longer in the key path and we can ignore
* it */
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = 0;
}
}
@@ -501,7 +501,7 @@ lookup_again:
/* search the current directory for the element name */
_debug("lookup '%s'", name);
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
start = jiffies;
next = lookup_one_len(name, dir, nlen);
@@ -585,7 +585,7 @@ lookup_again:
/* process the next component */
if (key) {
_debug("advance");
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(dir);
dir = next;
next = NULL;
@@ -623,7 +623,7 @@ lookup_again:
/* note that we're now using this object */
ret = cachefiles_mark_object_active(cache, object);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(dir);
dir = NULL;
@@ -705,7 +705,7 @@ lookup_error:
cachefiles_io_error(cache, "Lookup failed");
next = NULL;
error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(next);
error_out2:
dput(dir);
@@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
_enter(",,%s", dirname);
/* search the current directory for the element name */
- mutex_lock(&d_inode(dir)->i_mutex);
+ inode_lock(d_inode(dir));
start = jiffies;
subdir = lookup_one_len(dirname, dir, strlen(dirname));
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
d_backing_inode(subdir)->i_ino);
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
/* we need to make sure the subdir is a directory */
ASSERT(d_backing_inode(subdir));
@@ -800,19 +800,19 @@ check_error:
return ERR_PTR(ret);
mkdir_error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = PTR_ERR(subdir);
pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
_leave(" = -ENOMEM");
return ERR_PTR(-ENOMEM);
}
@@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir, filename);
/* look up the victim */
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
@@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
* at the netfs's request whilst the cull was in progress
*/
if (d_is_negative(victim)) {
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(victim);
_leave(" = -ENOENT [absent]");
return ERR_PTR(-ENOENT);
@@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
object_in_use:
read_unlock(&cache->active_lock);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(victim);
//_leave(" = -EBUSY [in use]");
return ERR_PTR(-EBUSY);
lookup_error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = PTR_ERR(victim);
if (ret == -ENOENT) {
/* file or dir now absent - probably retired by netfs */
@@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
return 0;
error_unlock:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
error:
dput(victim);
if (ret == -ENOENT) {
@@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
if (IS_ERR(victim))
return PTR_ERR(victim);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(victim);
//_leave(" = 0");
return 0;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a4766ded1ba7..7680e2626815 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -197,7 +197,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
return;
/* Avoid multiple racing open requests */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (ci->fscache)
goto done;
@@ -207,7 +207,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
ci, true);
fscache_check_consistency(ci->fscache);
done:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c69e1253b47b..cdbf8cf3d52c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9314b4ea2375..fd11fb231a2e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = -EINVAL;
switch (whence) {
case SEEK_CUR:
@@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fe02ae7f056a..3b3172357326 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
if (IS_ERR(req))
return PTR_ERR(req);
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
req->r_inode = d_inode(child);
ihold(d_inode(child));
@@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
if (!err) {
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3c68e6aee2f0..10c5ae79696e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1014,7 +1014,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!prealloc_cf)
return -ENOMEM;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
@@ -1070,7 +1070,7 @@ retry_snap:
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
@@ -1097,7 +1097,7 @@ retry_snap:
"got EOLDSNAPC, retrying\n",
inode, ceph_vinop(inode),
pos, (unsigned)count);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
goto retry_snap;
}
if (written > 0)
@@ -1117,7 +1117,7 @@ retry_snap:
iocb->ki_pos = pos + written;
if (inode->i_size > old_size)
ceph_fscache_update_objectsize(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (written >= 0) {
@@ -1147,7 +1147,7 @@ retry_snap:
goto out_unlocked;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out_unlocked:
ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;
@@ -1162,7 +1162,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
@@ -1207,7 +1207,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -1363,7 +1363,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (!prealloc_cf)
return -ENOMEM;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (ceph_snap(inode) != CEPH_NOSNAP) {
ret = -EROFS;
@@ -1418,7 +1418,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got);
unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ceph_free_cap_flush(prealloc_cf);
return ret;
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f446afada328..ca4d5e8457f1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -639,8 +639,8 @@ static int __init init_caches(void)
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ceph_inode_init_once);
if (ceph_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 565a4c0d2ee9..c48ca13673e3 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -642,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
while (*s && *s != sep)
s++;
- mutex_lock(&dir->i_mutex);
+ inode_lock(dir);
child = lookup_one_len(p, dentry, s - p);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -1109,7 +1109,7 @@ cifs_init_inodecache(void)
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0068e82217c3..ff882aeaccc6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (rc)
return rc;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
xid = get_xid();
@@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
}
free_xid(xid);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc;
}
@@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (rc)
return rc;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
xid = get_xid();
@@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
free_xid(xid);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc;
}
@@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
* with a brlock that prevents writing.
*/
down_read(&cinode->lock_sem);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
rc = generic_write_checks(iocb, from);
if (rc <= 0)
@@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
else
rc = -EACCES;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (rc > 0) {
ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index f829fe963f5b..5104d84c4f64 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
} while (0)
-#define CODA_FREE(ptr,size) \
- do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
/* inode to cnode access functions */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index fda9f4311212..42e731b8c80a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
if (host_file->f_op->iterate) {
struct inode *host_inode = file_inode(host_file);
- mutex_lock(&host_inode->i_mutex);
+ inode_lock(host_inode);
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
ret = host_file->f_op->iterate(host_file, ctx);
file_accessed(host_file);
}
- mutex_unlock(&host_inode->i_mutex);
+ inode_unlock(host_inode);
return ret;
}
/* Venus: we must read Venus dirents from a file */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 1da3805f3ddc..f47c7483863b 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
host_file = cfi->cfi_container;
file_start_write(host_file);
- mutex_lock(&coda_inode->i_mutex);
+ inode_lock(coda_inode);
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
- mutex_unlock(&coda_inode->i_mutex);
+ inode_unlock(coda_inode);
file_end_write(host_file);
return ret;
}
@@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&coda_inode->i_mutex);
+ inode_lock(coda_inode);
cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
err = vfs_fsync(host_file, datasync);
if (!err && !datasync)
err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
- mutex_unlock(&coda_inode->i_mutex);
+ inode_unlock(coda_inode);
return err;
}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390b87a3..57e81cbba0fa 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
- sizeof(struct coda_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct coda_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (coda_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7ae97e83f121..f419519ec41f 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -640,13 +640,13 @@ static void detach_groups(struct config_group *group)
child = sd->s_dentry;
- mutex_lock(&d_inode(child)->i_mutex);
+ inode_lock(d_inode(child));
configfs_detach_group(sd->s_element);
d_inode(child)->i_flags |= S_DEAD;
dont_mount(child);
- mutex_unlock(&d_inode(child)->i_mutex);
+ inode_unlock(d_inode(child));
d_delete(child);
dput(child);
@@ -834,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item,
* the VFS may already have hit and used them. Thus,
* we must lock them as rmdir() would.
*/
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
configfs_remove_dir(item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
d_delete(dentry);
}
}
@@ -874,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item,
* We must also lock the inode to remove it safely in case of
* error, as rmdir() would.
*/
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
configfs_adjust_dir_dirent_depth_before_populate(sd);
ret = populate_groups(to_config_group(item));
if (ret) {
@@ -883,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item,
dont_mount(dentry);
}
configfs_adjust_dir_dirent_depth_after_populate(sd);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
if (ret)
d_delete(dentry);
}
@@ -1070,11 +1070,55 @@ out:
return ret;
}
+static int configfs_do_depend_item(struct dentry *subsys_dentry,
+ struct config_item *target)
+{
+ struct configfs_dirent *p;
+ int ret;
+
+ spin_lock(&configfs_dirent_lock);
+ /* Scan the tree, return 0 if found */
+ ret = configfs_depend_prep(subsys_dentry, target);
+ if (ret)
+ goto out_unlock_dirent_lock;
+
+ /*
+ * We are sure that the item is not about to be removed by rmdir(), and
+ * not in the middle of attachment by mkdir().
+ */
+ p = target->ci_dentry->d_fsdata;
+ p->s_dependent_count += 1;
+
+out_unlock_dirent_lock:
+ spin_unlock(&configfs_dirent_lock);
+
+ return ret;
+}
+
+static inline struct configfs_dirent *
+configfs_find_subsys_dentry(struct configfs_dirent *root_sd,
+ struct config_item *subsys_item)
+{
+ struct configfs_dirent *p;
+ struct configfs_dirent *ret = NULL;
+
+ list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+ if (p->s_type & CONFIGFS_DIR &&
+ p->s_element == subsys_item) {
+ ret = p;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+
int configfs_depend_item(struct configfs_subsystem *subsys,
struct config_item *target)
{
int ret;
- struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+ struct configfs_dirent *subsys_sd;
struct config_item *s_item = &subsys->su_group.cg_item;
struct dentry *root;
@@ -1091,43 +1135,19 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
* subsystem is really registered, and so we need to lock out
* configfs_[un]register_subsystem().
*/
- mutex_lock(&d_inode(root)->i_mutex);
-
- root_sd = root->d_fsdata;
-
- list_for_each_entry(p, &root_sd->s_children, s_sibling) {
- if (p->s_type & CONFIGFS_DIR) {
- if (p->s_element == s_item) {
- subsys_sd = p;
- break;
- }
- }
- }
+ inode_lock(d_inode(root));
+ subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item);
if (!subsys_sd) {
ret = -ENOENT;
goto out_unlock_fs;
}
/* Ok, now we can trust subsys/s_item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
- spin_lock(&configfs_dirent_lock);
- /* Scan the tree, return 0 if found */
- ret = configfs_depend_prep(subsys_sd->s_dentry, target);
- if (ret)
- goto out_unlock_dirent_lock;
-
- /*
- * We are sure that the item is not about to be removed by rmdir(), and
- * not in the middle of attachment by mkdir().
- */
- p = target->ci_dentry->d_fsdata;
- p->s_dependent_count += 1;
-
-out_unlock_dirent_lock:
- spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
/*
* If we succeeded, the fs is pinned via other methods. If not,
@@ -1144,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item);
* configfs_depend_item() because we know that that the client driver is
* pinned, thus the subsystem is pinned, and therefore configfs is pinned.
*/
-void configfs_undepend_item(struct configfs_subsystem *subsys,
- struct config_item *target)
+void configfs_undepend_item(struct config_item *target)
{
struct configfs_dirent *sd;
@@ -1168,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
}
EXPORT_SYMBOL(configfs_undepend_item);
+/*
+ * caller_subsys is the caller's subsystem, not the target's. It is used to
+ * decide whether we must lock the root and check the subsystem. When we are
+ * in the same subsystem as our target there is no need to do locking, as
+ * we know that the subsystem is valid and is not unregistered during this
+ * function: we are called from a callback of one of its children and the VFS
+ * holds a lock on some inode. Otherwise we have to lock the root to ensure
+ * that the target's subsystem is not unregistered during this function.
+ */
+int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys,
+ struct config_item *target)
+{
+ struct configfs_subsystem *target_subsys;
+ struct config_group *root, *parent;
+ struct configfs_dirent *subsys_sd;
+ int ret = -ENOENT;
+
+ /* Disallow this function for configfs root */
+ if (configfs_is_root(target))
+ return -EINVAL;
+
+ parent = target->ci_group;
+ /*
+ * This may happen when someone is trying to depend on the root
+ * directory of some subsystem
+ */
+ if (configfs_is_root(&parent->cg_item)) {
+ target_subsys = to_configfs_subsystem(to_config_group(target));
+ root = parent;
+ } else {
+ target_subsys = parent->cg_subsys;
+ /* Find a configfs root as we may need it for locking */
+ for (root = parent; !configfs_is_root(&root->cg_item);
+ root = root->cg_item.ci_group)
+ ;
+ }
+
+ if (target_subsys != caller_subsys) {
+ /*
+ * We are in another configfs subsystem, so we have to do
+ * additional locking to prevent the other subsystem from being
+ * unregistered
+ */
+ inode_lock(d_inode(root->cg_item.ci_dentry));
+
+ /*
+ * As we are trying to depend on an item from another subsystem,
+ * we have to check if this subsystem is still registered
+ */
+ subsys_sd = configfs_find_subsys_dentry(
+ root->cg_item.ci_dentry->d_fsdata,
+ &target_subsys->su_group.cg_item);
+ if (!subsys_sd)
+ goto out_root_unlock;
+ } else {
+ subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata;
+ }
+
+ /* Now we can execute core of depend item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
+
+ if (target_subsys != caller_subsys)
+out_root_unlock:
+ /*
+ * We were called from a subsystem other than our target's, so we
+ * took the root inode lock and now it's time to release it
+ */
+ inode_unlock(d_inode(root->cg_item.ci_dentry));
+
+ return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item_unlocked);
+
static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret = 0;
@@ -1469,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
down_write(&configfs_rename_sem);
parent = item->parent->dentry;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
if (!IS_ERR(new_dentry)) {
@@ -1485,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
error = -EEXIST;
dput(new_dentry);
}
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
up_write(&configfs_rename_sem);
return error;
@@ -1498,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
struct configfs_dirent * parent_sd = dentry->d_fsdata;
int err;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
/*
* Fake invisibility if dir belongs to a group/default groups hierarchy
* being attached
@@ -1511,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
else
err = 0;
}
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return err;
}
@@ -1521,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
struct dentry * dentry = file->f_path.dentry;
struct configfs_dirent * cursor = file->private_data;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
spin_lock(&configfs_dirent_lock);
list_del_init(&cursor->s_sibling);
spin_unlock(&configfs_dirent_lock);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
release_configfs_dirent(cursor);
@@ -1606,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry * dentry = file->f_path.dentry;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
switch (whence) {
case 1:
offset += file->f_pos;
@@ -1614,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -1640,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&configfs_dirent_lock);
}
}
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return offset;
}
@@ -1675,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group,
parent = parent_group->cg_item.ci_dentry;
- mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
ret = create_default_group(parent_group, group);
if (!ret) {
spin_lock(&configfs_dirent_lock);
configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
spin_unlock(&configfs_dirent_lock);
}
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
return ret;
}
EXPORT_SYMBOL(configfs_register_group);
@@ -1699,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group)
struct dentry *dentry = group->cg_item.ci_dentry;
struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
- mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
spin_lock(&configfs_dirent_lock);
configfs_detach_prep(dentry, NULL);
spin_unlock(&configfs_dirent_lock);
@@ -1708,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group)
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
d_delete(dentry);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
dput(dentry);
@@ -1780,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
sd = root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
- mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
err = -ENOMEM;
dentry = d_alloc_name(root, group->cg_item.ci_name);
@@ -1800,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
}
}
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
if (err) {
unlink_group(group);
@@ -1821,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
return;
}
- mutex_lock_nested(&d_inode(root)->i_mutex,
+ inode_lock_nested(d_inode(root),
I_MUTEX_PARENT);
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
if (configfs_detach_prep(dentry, NULL)) {
@@ -1834,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
d_delete(dentry);
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(dentry);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3687187c8ea5..33b7ee34eda5 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
int error = 0;
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL);
+ inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
CONFIGFS_ITEM_ATTR);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
return error;
}
@@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item,
umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG;
int error = 0;
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+ inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
CONFIGFS_ITEM_BIN_ATTR);
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
return error;
}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 0cc810e9dccc..cee087d8f7e0 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
/* no inode means this hasn't been made visible yet */
return;
- mutex_lock(&d_inode(dir)->i_mutex);
+ inode_lock(d_inode(dir));
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (!sd->s_element)
continue;
@@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
break;
}
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
}
diff --git a/fs/coredump.c b/fs/coredump.c
index b3c153ca435d..9ea87e9fdccf 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -118,6 +118,26 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
ret = cn_vprintf(cn, fmt, arg);
va_end(arg);
+ if (ret == 0) {
+ /*
+ * Ensure that this coredump name component can't cause the
+ * resulting corefile path to consist of a ".." or ".".
+ */
+ if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
+ (cn->used - cur == 2 && cn->corename[cur] == '.'
+ && cn->corename[cur+1] == '.'))
+ cn->corename[cur] = '!';
+
+ /*
+ * Empty names are fishy and could be used to create a "//" in a
+ * corefile name, causing the coredump to happen one directory
+ * level too high. Enforce that all components of the core
+ * pattern are at least one character long.
+ */
+ if (cn->used == cur)
+ ret = cn_printf(cn, "!");
+ }
+
for (; cur < cn->used; ++cur) {
if (cn->corename[cur] == '/')
cn->corename[cur] = '!';
diff --git a/fs/dax.c b/fs/dax.c
index 43671b68220e..4fd6b0c5c6b5 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,58 +24,73 @@
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
+
+static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
+{
+ struct request_queue *q = bdev->bd_queue;
+ long rc = -EIO;
+
+ dax->addr = (void __pmem *) ERR_PTR(-EIO);
+ if (blk_queue_enter(q, true) != 0)
+ return rc;
+
+ rc = bdev_direct_access(bdev, dax);
+ if (rc < 0) {
+ dax->addr = (void __pmem *) ERR_PTR(rc);
+ blk_queue_exit(q);
+ return rc;
+ }
+ return rc;
+}
+
+static void dax_unmap_atomic(struct block_device *bdev,
+ const struct blk_dax_ctl *dax)
+{
+ if (IS_ERR(dax->addr))
+ return;
+ blk_queue_exit(bdev->bd_queue);
+}
/*
* dax_clear_blocks() is called from within transaction context from XFS,
* and hence this means the stack from this point must follow GFP_NOFS
* semantics for all operations.
*/
-int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- sector_t sector = block << (inode->i_blkbits - 9);
+ struct blk_dax_ctl dax = {
+ .sector = block << (inode->i_blkbits - 9),
+ .size = _size,
+ };
might_sleep();
do {
- void __pmem *addr;
- unsigned long pfn;
- long count;
+ long count, sz;
- count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+ count = dax_map_atomic(bdev, &dax);
if (count < 0)
return count;
- BUG_ON(size < count);
- while (count > 0) {
- unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
- if (pgsz > count)
- pgsz = count;
- clear_pmem(addr, pgsz);
- addr += pgsz;
- size -= pgsz;
- count -= pgsz;
- BUG_ON(pgsz & 511);
- sector += pgsz / 512;
- cond_resched();
- }
- } while (size);
+ sz = min_t(long, count, SZ_128K);
+ clear_pmem(dax.addr, sz);
+ dax.size -= sz;
+ dax.sector += sz / 512;
+ dax_unmap_atomic(bdev, &dax);
+ cond_resched();
+ } while (dax.size);
wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
-static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
- unsigned blkbits)
-{
- unsigned long pfn;
- sector_t sector = bh->b_blocknr << (blkbits - 9);
- return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
-}
-
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
loff_t pos, loff_t end)
@@ -105,19 +120,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
return bh->b_state != 0;
}
+
+static sector_t to_sector(const struct buffer_head *bh,
+ const struct inode *inode)
+{
+ sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+ return sector;
+}
+
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t start, loff_t end, get_block_t get_block,
struct buffer_head *bh)
{
- ssize_t retval = 0;
- loff_t pos = start;
- loff_t max = start;
- loff_t bh_max = start;
- void __pmem *addr;
- bool hole = false;
- bool need_wmb = false;
-
- if (iov_iter_rw(iter) != WRITE)
+ loff_t pos = start, max = start, bh_max = start;
+ bool hole = false, need_wmb = false;
+ struct block_device *bdev = NULL;
+ int rw = iov_iter_rw(iter), rc;
+ long map_len = 0;
+ struct blk_dax_ctl dax = {
+ .addr = (void __pmem *) ERR_PTR(-EIO),
+ };
+
+ if (rw == READ)
end = min(end, i_size_read(inode));
while (pos < end) {
@@ -132,13 +157,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
if (pos == bh_max) {
bh->b_size = PAGE_ALIGN(end - pos);
bh->b_state = 0;
- retval = get_block(inode, block, bh,
- iov_iter_rw(iter) == WRITE);
- if (retval)
+ rc = get_block(inode, block, bh, rw == WRITE);
+ if (rc)
break;
if (!buffer_size_valid(bh))
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
+ bdev = bh->b_bdev;
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -146,47 +171,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size -= done;
}
- hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+ hole = rw == READ && !buffer_written(bh);
if (hole) {
- addr = NULL;
size = bh->b_size - first;
} else {
- retval = dax_get_addr(bh, &addr, blkbits);
- if (retval < 0)
+ dax_unmap_atomic(bdev, &dax);
+ dax.sector = to_sector(bh, inode);
+ dax.size = bh->b_size;
+ map_len = dax_map_atomic(bdev, &dax);
+ if (map_len < 0) {
+ rc = map_len;
break;
+ }
if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(addr, retval, first, pos,
- end);
+ dax_new_buf(dax.addr, map_len, first,
+ pos, end);
need_wmb = true;
}
- addr += first;
- size = retval - first;
+ dax.addr += first;
+ size = map_len - first;
}
max = min(pos + size, end);
}
if (iov_iter_rw(iter) == WRITE) {
- len = copy_from_iter_pmem(addr, max - pos, iter);
+ len = copy_from_iter_pmem(dax.addr, max - pos, iter);
need_wmb = true;
} else if (!hole)
- len = copy_to_iter((void __force *)addr, max - pos,
+ len = copy_to_iter((void __force *) dax.addr, max - pos,
iter);
else
len = iov_iter_zero(max - pos, iter);
if (!len) {
- retval = -EFAULT;
+ rc = -EFAULT;
break;
}
pos += len;
- addr += len;
+ if (!IS_ERR(dax.addr))
+ dax.addr += len;
}
if (need_wmb)
wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
- return (pos == start) ? retval : pos - start;
+ return (pos == start) ? rc : pos - start;
}
/**
@@ -215,13 +246,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
loff_t end = pos + iov_iter_count(iter);
memset(&bh, 0, sizeof(bh));
+ bh.b_bdev = inode->i_sb->s_bdev;
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
struct address_space *mapping = inode->i_mapping;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = filemap_write_and_wait_range(mapping, pos, end - 1);
if (retval) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
}
@@ -233,7 +265,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
retval = dax_io(inode, iter, pos, end, get_block, &bh);
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if ((retval > 0) && end_io)
end_io(iocb, pos, retval, bh.b_private);
@@ -275,28 +307,228 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
return VM_FAULT_LOCKED;
}
-static int copy_user_bh(struct page *to, struct buffer_head *bh,
- unsigned blkbits, unsigned long vaddr)
+static int copy_user_bh(struct page *to, struct inode *inode,
+ struct buffer_head *bh, unsigned long vaddr)
{
- void __pmem *vfrom;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
+ struct block_device *bdev = bh->b_bdev;
void *vto;
- if (dax_get_addr(bh, &vfrom, blkbits) < 0)
- return -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)vfrom, vaddr, to);
+ copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
kunmap_atomic(vto);
+ dax_unmap_atomic(bdev, &dax);
return 0;
}
+#define NO_SECTOR -1
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+ sector_t sector, bool pmd_entry, bool dirty)
+{
+ struct radix_tree_root *page_tree = &mapping->page_tree;
+ pgoff_t pmd_index = DAX_PMD_INDEX(index);
+ int type, error = 0;
+ void *entry;
+
+ WARN_ON_ONCE(pmd_entry && !dirty);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ entry = radix_tree_lookup(page_tree, pmd_index);
+ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+ index = pmd_index;
+ goto dirty;
+ }
+
+ entry = radix_tree_lookup(page_tree, index);
+ if (entry) {
+ type = RADIX_DAX_TYPE(entry);
+ if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+ type != RADIX_DAX_PMD)) {
+ error = -EIO;
+ goto unlock;
+ }
+
+ if (!pmd_entry || type == RADIX_DAX_PMD)
+ goto dirty;
+
+ /*
+ * We only insert dirty PMD entries into the radix tree. This
+ * means we don't need to worry about removing a dirty PTE
+ * entry and inserting a clean PMD entry, thus reducing the
+ * range we would flush with a follow-up fsync/msync call.
+ */
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ }
+
+ if (sector == NO_SECTOR) {
+ /*
+ * This can happen during correct operation if our pfn_mkwrite
+ * fault raced against a hole punch operation. If this
+ * happens the pte that was hole punched will have been
+ * unmapped and the radix tree entry will have been removed by
+ * the time we are called, but the call will still happen. We
+ * will return all the way up to wp_pfn_shared(), where the
+ * pte_same() check will fail, eventually causing page fault
+ * to be retried by the CPU.
+ */
+ goto unlock;
+ }
+
+ error = radix_tree_insert(page_tree, index,
+ RADIX_DAX_ENTRY(sector, pmd_entry));
+ if (error)
+ goto unlock;
+
+ mapping->nrexceptional++;
+ dirty:
+ if (dirty)
+ radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+ spin_unlock_irq(&mapping->tree_lock);
+ return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+ struct address_space *mapping, pgoff_t index, void *entry)
+{
+ struct radix_tree_root *page_tree = &mapping->page_tree;
+ int type = RADIX_DAX_TYPE(entry);
+ struct radix_tree_node *node;
+ struct blk_dax_ctl dax;
+ void **slot;
+ int ret = 0;
+
+ spin_lock_irq(&mapping->tree_lock);
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+
+ /* another fsync thread may have already written back this entry */
+ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ goto unlock;
+
+ if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ dax.sector = RADIX_DAX_SECTOR(entry);
+ dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+ spin_unlock_irq(&mapping->tree_lock);
+
+ /*
+ * We cannot hold tree_lock while calling dax_map_atomic() because it
+ * eventually calls cond_resched().
+ */
+ ret = dax_map_atomic(bdev, &dax);
+ if (ret < 0)
+ return ret;
+
+ if (WARN_ON_ONCE(ret < dax.size)) {
+ ret = -EIO;
+ goto unmap;
+ }
+
+ wb_cache_pmem(dax.addr, dax.size);
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+ dax_unmap_atomic(bdev, &dax);
+ return ret;
+
+ unlock:
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ struct inode *inode = mapping->host;
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ pgoff_t start_index, end_index, pmd_index;
+ pgoff_t indices[PAGEVEC_SIZE];
+ struct pagevec pvec;
+ bool done = false;
+ int i, ret = 0;
+ void *entry;
+
+ if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+ return -EIO;
+
+ start_index = start >> PAGE_CACHE_SHIFT;
+ end_index = end >> PAGE_CACHE_SHIFT;
+ pmd_index = DAX_PMD_INDEX(start_index);
+
+ rcu_read_lock();
+ entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+ rcu_read_unlock();
+
+ /* see if the start of our range is covered by a PMD entry */
+ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+ start_index = pmd_index;
+
+ tag_pages_for_writeback(mapping, start_index, end_index);
+
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ pvec.nr = find_get_entries_tag(mapping, start_index,
+ PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+ pvec.pages, indices);
+
+ if (pvec.nr == 0)
+ break;
+
+ for (i = 0; i < pvec.nr; i++) {
+ if (indices[i] > end_index) {
+ done = true;
+ break;
+ }
+
+ ret = dax_writeback_one(bdev, mapping, indices[i],
+ pvec.pages[i]);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ wmb_pmem();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- void __pmem *addr;
- unsigned long pfn;
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev = bh->b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
pgoff_t size;
int error;
@@ -315,20 +547,23 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
goto out;
}
- error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
- if (error < 0)
- goto out;
- if (error < PAGE_SIZE) {
- error = -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0) {
+ error = PTR_ERR(dax.addr);
goto out;
}
if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(addr, PAGE_SIZE);
+ clear_pmem(dax.addr, PAGE_SIZE);
wmb_pmem();
}
+ dax_unmap_atomic(bdev, &dax);
+
+ error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+ vmf->flags & FAULT_FLAG_WRITE);
+ if (error)
+ goto out;
- error = vm_insert_mixed(vma, vaddr, pfn);
+ error = vm_insert_mixed(vma, vaddr, dax.pfn);
out:
i_mmap_unlock_read(mapping);
@@ -373,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
memset(&bh, 0, sizeof(bh));
block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+ bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_SIZE;
repeat:
@@ -422,7 +658,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
if (buffer_written(&bh))
- error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+ error = copy_user_bh(new_page, inode, &bh, vaddr);
else
clear_user_highpage(new_page, vaddr);
if (error)
@@ -452,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
delete_from_page_cache(page);
unlock_page(page);
page_cache_release(page);
+ page = NULL;
}
/*
@@ -523,6 +760,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+static void __dax_dbg(struct buffer_head *bh, unsigned long address,
+ const char *reason, const char *fn)
+{
+ if (bh) {
+ char bname[BDEVNAME_SIZE];
+ bdevname(bh->b_bdev, bname);
+ pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
+ "length %zd fallback: %s\n", fn, current->comm,
+ address, bname, bh->b_state, (u64)bh->b_blocknr,
+ bh->b_size, reason);
+ } else {
+ pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
+ current->comm, address, reason);
+ }
+}
+
+#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
+
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, unsigned int flags, get_block_t get_block,
dax_iodone_t complete_unwritten)
@@ -534,61 +789,83 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
unsigned blkbits = inode->i_blkbits;
unsigned long pmd_addr = address & PMD_MASK;
bool write = flags & FAULT_FLAG_WRITE;
- long length;
- void __pmem *kaddr;
+ struct block_device *bdev;
pgoff_t size, pgoff;
- sector_t block, sector;
- unsigned long pfn;
- int result = 0;
+ sector_t block;
+ int error, result = 0;
+ bool alloc = false;
- /* dax pmd mappings are broken wrt gup and fork */
+ /* dax pmd mappings require pfn_t_devmap() */
if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
return VM_FAULT_FALLBACK;
/* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED))
+ if (write && !(vma->vm_flags & VM_SHARED)) {
+ split_huge_pmd(vma, pmd, address);
+ dax_pmd_dbg(NULL, address, "cow write");
return VM_FAULT_FALLBACK;
+ }
/* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start)
+ if (pmd_addr < vma->vm_start) {
+ dax_pmd_dbg(NULL, address, "vma start unaligned");
return VM_FAULT_FALLBACK;
- if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ }
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
+ dax_pmd_dbg(NULL, address, "vma end unaligned");
return VM_FAULT_FALLBACK;
+ }
pgoff = linear_page_index(vma, pmd_addr);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= size)
return VM_FAULT_SIGBUS;
/* If the PMD would cover blocks out of the file */
- if ((pgoff | PG_PMD_COLOUR) >= size)
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(NULL, address,
+ "offset + huge page size > file size");
return VM_FAULT_FALLBACK;
+ }
memset(&bh, 0, sizeof(bh));
+ bh.b_bdev = inode->i_sb->s_bdev;
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
bh.b_size = PMD_SIZE;
- length = get_block(inode, block, &bh, write);
- if (length)
+
+ if (get_block(inode, block, &bh, 0) != 0)
return VM_FAULT_SIGBUS;
- i_mmap_lock_read(mapping);
+
+ if (!buffer_mapped(&bh) && write) {
+ if (get_block(inode, block, &bh, 1) != 0)
+ return VM_FAULT_SIGBUS;
+ alloc = true;
+ }
+
+ bdev = bh.b_bdev;
/*
* If the filesystem isn't willing to tell us the length of a hole,
* just fall back to PTEs. Calling get_block 512 times in a loop
* would be silly.
*/
- if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
- goto fallback;
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "allocated block too small");
+ return VM_FAULT_FALLBACK;
+ }
/*
* If we allocated new storage, make sure no process has any
* zero pages covering this hole
*/
- if (buffer_new(&bh)) {
- i_mmap_unlock_read(mapping);
- unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
- i_mmap_lock_read(mapping);
+ if (alloc) {
+ loff_t lstart = pgoff << PAGE_SHIFT;
+ loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+ truncate_pagecache_range(inode, lstart, lend);
}
+ i_mmap_lock_read(mapping);
+
/*
* If a truncate happened while we were allocating blocks, we may
* leave blocks allocated to the file that are beyond EOF. We can't
@@ -600,57 +877,108 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
result = VM_FAULT_SIGBUS;
goto out;
}
- if ((pgoff | PG_PMD_COLOUR) >= size)
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(&bh, address,
+ "offset + huge page size > file size");
goto fallback;
+ }
if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
spinlock_t *ptl;
pmd_t entry;
struct page *zero_page = get_huge_zero_page();
- if (unlikely(!zero_page))
+ if (unlikely(!zero_page)) {
+ dax_pmd_dbg(&bh, address, "no zero page");
goto fallback;
+ }
ptl = pmd_lock(vma->vm_mm, pmd);
if (!pmd_none(*pmd)) {
spin_unlock(ptl);
+ dax_pmd_dbg(&bh, address, "pmd already present");
goto fallback;
}
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
+ __func__, current->comm, address,
+ (unsigned long long) to_sector(&bh, inode));
+
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
result = VM_FAULT_NOPAGE;
spin_unlock(ptl);
} else {
- sector = bh.b_blocknr << (blkbits - 9);
- length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
- bh.b_size);
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PMD_SIZE,
+ };
+ long length = dax_map_atomic(bdev, &dax);
+
if (length < 0) {
result = VM_FAULT_SIGBUS;
goto out;
}
- if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ if (length < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "dax-length too small");
+ dax_unmap_atomic(bdev, &dax);
goto fallback;
+ }
+ if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
+ dax_pmd_dbg(&bh, address, "pfn unaligned");
+ dax_unmap_atomic(bdev, &dax);
+ goto fallback;
+ }
- /*
- * TODO: teach vmf_insert_pfn_pmd() to support
- * 'pte_special' for pmds
- */
- if (pfn_valid(pfn))
+ if (!pfn_t_devmap(dax.pfn)) {
+ dax_unmap_atomic(bdev, &dax);
+ dax_pmd_dbg(&bh, address, "pfn not in memmap");
goto fallback;
+ }
if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- int i;
- for (i = 0; i < PTRS_PER_PMD; i++)
- clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ clear_pmem(dax.addr, PMD_SIZE);
wmb_pmem();
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
result |= VM_FAULT_MAJOR;
}
+ dax_unmap_atomic(bdev, &dax);
+
+ /*
+ * For PTE faults we insert a radix tree entry for reads, and
+ * leave it clean. Then on the first write we dirty the radix
+ * tree entry via the dax_pfn_mkwrite() path. This sequence
+ * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+ * call into get_block() to translate the pgoff to a sector in
+ * order to be able to create a new radix tree entry.
+ *
+ * The PMD path doesn't have an equivalent to
+ * dax_pfn_mkwrite(), though, so for a read followed by a
+ * write we traverse all the way through __dax_pmd_fault()
+ * twice. This means we can just skip inserting a radix tree
+ * entry completely on the initial read and just wait until
+ * the write to insert a dirty entry.
+ */
+ if (write) {
+ error = dax_radix_entry(mapping, pgoff, dax.sector,
+ true, true);
+ if (error) {
+ dax_pmd_dbg(&bh, address,
+ "PMD radix insertion failed");
+ goto fallback;
+ }
+ }
- result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: %lx sect: %llx\n",
+ __func__, current->comm, address,
+ pfn_t_to_pfn(dax.pfn),
+ (unsigned long long) dax.sector);
+ result |= vmf_insert_pfn_pmd(vma, address, pmd,
+ dax.pfn, write);
}
out:
@@ -702,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
- *
*/
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ struct file *file = vma->vm_file;
- sb_start_pagefault(sb);
- file_update_time(vma->vm_file);
- sb_end_pagefault(sb);
+ /*
+ * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+ * RADIX_DAX_PTE entry already exists in the radix tree from a
+ * previous call to __dax_fault(). We just want to look up that PTE
+ * entry using vmf->pgoff and make sure the dirty tag is set. This
+ * saves us from having to make a call to get_block() here to look
+ * up the sector.
+ */
+ dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -747,17 +1080,23 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
BUG_ON((offset + length) > PAGE_CACHE_SIZE);
memset(&bh, 0, sizeof(bh));
+ bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_CACHE_SIZE;
err = get_block(inode, index, &bh, 0);
if (err < 0)
return err;
if (buffer_written(&bh)) {
- void __pmem *addr;
- err = dax_get_addr(&bh, &addr, inode->i_blkbits);
- if (err < 0)
- return err;
- clear_pmem(addr + offset, length);
+ struct block_device *bdev = bh.b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PAGE_CACHE_SIZE,
+ };
+
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
}
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 8d38cd07b207..92d5140de851 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
- struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT);
if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
@@ -2461,7 +2462,7 @@ EXPORT_SYMBOL(d_rehash);
*/
void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
{
- BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+ BUG_ON(!inode_is_locked(dentry->d_parent->d_inode));
BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
spin_lock(&dentry->d_lock);
@@ -2737,7 +2738,7 @@ static int __d_unalias(struct inode *inode,
if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
goto out_err;
m1 = &dentry->d_sb->s_vfs_rename_mutex;
- if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
+ if (!inode_trylock(alias->d_parent->d_inode))
goto out_err;
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
@@ -3415,7 +3416,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b7fcc0de0b2f..bece948b363d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
dput(dentry);
@@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
}
if (IS_ERR(dentry)) {
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
@@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
static struct dentry *failed_creating(struct dentry *dentry)
{
- mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+ inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
return NULL;
@@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+ inode_unlock(d_inode(dentry->d_parent));
return dentry;
}
@@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry)
if (!parent || d_really_is_negative(parent))
return;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
ret = __debugfs_remove(dentry, parent);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
if (!ret)
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
@@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
parent = dentry;
down:
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
loop:
/*
* The parent->d_subdirs is protected by the d_lock. Outside that
@@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
/* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
spin_unlock(&parent->d_lock);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
parent = child;
goto down;
}
@@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry)
}
spin_unlock(&parent->d_lock);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
child = parent;
parent = parent->d_parent;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
if (child != dentry)
/* go up */
@@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c35ffdc12bba..1f107fd51328 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb)
if (!uid_valid(root_uid) || !gid_valid(root_gid))
return -EINVAL;
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
/* If we have already created ptmx node, return */
if (fsi->ptmx_dentry) {
@@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb)
fsi->ptmx_dentry = dentry;
rc = 0;
out:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
return rc;
}
@@ -615,7 +615,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
sprintf(s, "%d", index);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = d_alloc_name(root, s);
if (dentry) {
@@ -626,7 +626,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
inode = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
return inode;
}
@@ -671,7 +671,7 @@ void devpts_pty_kill(struct inode *inode)
BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = d_find_alias(inode);
@@ -680,7 +680,7 @@ void devpts_pty_kill(struct inode *inode)
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
dput(dentry); /* d_find_alias above */
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
}
static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 602e8441bc0f..1b2f7ffc8b84 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
iocb->ki_filp->f_mapping;
/* will be released by direct_io_worker */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kmem_cache_free(dio_cache, dio);
goto out;
}
@@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->i_size = i_size_read(inode);
if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
if (dio->flags & DIO_LOCKING)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kmem_cache_free(dio_cache, dio);
retval = 0;
goto out;
@@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* of protecting us from looking up uninitialized blocks.
*/
if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
- mutex_unlock(&dio->inode->i_mutex);
+ inode_unlock(dio->inode);
/*
* The only time we want to leave bios in flight is when a successful
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 1925d6d222b8..58c2f4a21b7f 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
return -EINVAL;
kbuf = memdup_user_nul(buf, count);
- if (!IS_ERR(kbuf))
+ if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
if (check_version(kbuf)) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 040aa879d634..4e685ac1024d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry)
struct dentry *dir;
dir = dget_parent(dentry);
- mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
return dir;
}
static void unlock_dir(struct dentry *dir)
{
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(dir);
}
@@ -397,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
int rc = 0;
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
- mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dir_dentry));
lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
lower_dir_dentry,
ecryptfs_dentry->d_name.len);
- mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dir_dentry));
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -426,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
"filename; rc = [%d]\n", __func__, rc);
goto out;
}
- mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dir_dentry));
lower_dentry = lookup_one_len(encrypted_and_encoded_name,
lower_dir_dentry,
encrypted_and_encoded_name_size);
- mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dir_dentry));
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -869,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = notify_change(lower_dentry, &lower_ia, NULL);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
}
return rc;
}
@@ -970,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
lower_ia.ia_valid &= ~ATTR_MODE;
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = notify_change(lower_dentry, &lower_ia, NULL);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
fsstack_copy_attr_all(inode, lower_inode);
return rc;
@@ -1048,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
size);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
return rc;
}
@@ -1075,9 +1075,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
return rc;
}
@@ -1092,9 +1092,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
return rc;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f4d0474bee9..e25b6b06bacf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -663,6 +663,7 @@ static struct ecryptfs_cache_info {
struct kmem_cache **cache;
const char *name;
size_t size;
+ unsigned long flags;
void (*ctor)(void *obj);
} ecryptfs_cache_infos[] = {
{
@@ -684,6 +685,7 @@ static struct ecryptfs_cache_info {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
+ .flags = SLAB_ACCOUNT,
.ctor = inode_info_init_once,
},
{
@@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void)
struct ecryptfs_cache_info *info;
info = &ecryptfs_cache_infos[i];
- *(info->cache) = kmem_cache_create(info->name, info->size,
- 0, SLAB_HWCACHE_ALIGN, info->ctor);
+ *(info->cache) = kmem_cache_create(info->name, info->size, 0,
+ SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
if (!*(info->cache)) {
ecryptfs_free_kmem_caches();
ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index caba848ac763..c6ced4cbf0cf 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
rc = -ENOMEM;
goto out;
}
- mutex_lock(&lower_inode->i_mutex);
+ inode_lock(lower_inode);
size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
xattr_virt, PAGE_CACHE_SIZE);
if (size < 0)
@@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
xattr_virt, size, 0);
- mutex_unlock(&lower_inode->i_mutex);
+ inode_unlock(lower_inode);
if (rc)
printk(KERN_ERR "Error whilst attempting to write inode size "
"to lower file xattr; rc = [%d]\n", rc);
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 90001da9abfd..c424e4813ec8 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -50,9 +50,9 @@ static ssize_t efivarfs_file_write(struct file *file,
d_delete(file->f_path.dentry);
dput(file->f_path.dentry);
} else {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
bytes = count;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 86a2121828c3..b8a564f29107 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -160,10 +160,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
efivar_entry_size(entry, &size);
efivar_entry_add(entry, &efivarfs_list);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode->i_private = entry;
i_size_write(inode, size + sizeof(entry->var.Attributes));
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
d_add(dentry, inode);
return 0;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a30f7da..cb68dac4f9d3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
- sizeof(struct efs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct efs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8d0c0df01854..ed70cf9fdc7b 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -45,10 +45,10 @@ struct eventfd_ctx {
*
* This function is supposed to be called by the kernel in paths that do not
* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returining a POLLERR
+ * value, and we signal this as overflow condition by returning a POLLERR
* to poll(2).
*
- * Returns the amount by which the counter was incrememnted. This will be less
+ * Returns the amount by which the counter was incremented. This will be less
* than @n if the counter has overflowed.
*/
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e009cad8d5c..ae1dbcf47e97 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -92,7 +92,7 @@
*/
/* Epoll private bits inside the event mask */
-#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -1002,6 +1002,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
+ int ewake = 0;
if ((unsigned long)key & POLLFREE) {
ep_pwq_from_wait(wait)->whead = NULL;
@@ -1066,8 +1067,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
- if (waitqueue_active(&ep->wq))
+ if (waitqueue_active(&ep->wq)) {
+ ewake = 1;
wake_up_locked(&ep->wq);
+ }
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1078,6 +1081,9 @@ out_unlock:
if (pwake)
ep_poll_safewake(&ep->poll_wait);
+ if (epi->event.events & EPOLLEXCLUSIVE)
+ return ewake;
+
return 1;
}
@@ -1095,7 +1101,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ if (epi->event.events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
@@ -1862,6 +1871,15 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
goto error_tgt_fput;
/*
+ * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
+ * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
+ * Also, we do not currently support nested exclusive wakeups.
+ */
+ if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
+ (op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
+ goto error_tgt_fput;
+
+ /*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
diff --git a/fs/exec.c b/fs/exec.c
index 828ec5f07de0..dcd4ac7d3f1e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1307,13 +1307,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
return;
/* Be careful if suid/sgid is set */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
uid = inode->i_uid;
gid = inode->i_gid;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 906de66e8e7e..28645f0640f7 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = sync_inode_metadata(filp->f_mapping->host, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..6658a50530a0 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
{
exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- exofs_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, exofs_init_once);
if (exofs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 714cd37a6ba3..c46f1a190b8d 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
int err;
parent = ERR_PTR(-EACCES);
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
if (mnt->mnt_sb->s_export_op->get_parent)
parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
dprintk("%s: get_parent of %ld failed, err %d\n",
@@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
goto out_err;
@@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
*/
err = exportfs_get_name(mnt, target_dir, nbuf, result);
if (!err) {
- mutex_lock(&target_dir->d_inode->i_mutex);
+ inode_lock(target_dir->d_inode);
nresult = lookup_one_len(nbuf, target_dir,
strlen(nbuf));
- mutex_unlock(&target_dir->d_inode->i_mutex);
+ inode_unlock(target_dir->d_inode);
if (!IS_ERR(nresult)) {
if (nresult->d_inode) {
dput(result);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 11a42c5a09ae..2c88d683cd91 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
{
struct inode *inode = file_inode(vma->vm_file);
struct ext2_inode_info *ei = EXT2_I(inode);
- int ret = VM_FAULT_NOPAGE;
loff_t size;
+ int ret;
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
@@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
+ else
+ ret = dax_pfn_mkwrite(vma, vmf);
up_read(&ei->dax_sem);
sb_end_pagefault(inode->i_sb);
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 5d46c09863f0..b386af2e45f4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = ext2_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto setflags_out;
}
@@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
*/
if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto setflags_out;
}
@@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mark_inode_dirty(inode);
setflags_out:
@@ -102,10 +102,10 @@ setflags_out:
goto setversion_out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode->i_ctime = CURRENT_TIME_SEC;
inode->i_generation = generation;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mark_inode_dirty(inode);
setversion_out:
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 748d35afc902..2a188413a2b0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -203,7 +203,7 @@ static int __init init_inodecache(void)
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 1a0835073663..c8021208a7eb 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page)
EXT4_DECRYPT, page->index, page, page);
}
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len)
{
struct ext4_crypto_ctx *ctx;
struct page *ciphertext_page = NULL;
struct bio *bio;
- ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
- ext4_fsblk_t pblk = ext4_ext_pblock(ex);
- unsigned int len = ext4_ext_get_actual_len(ex);
int ret, err = 0;
#if 0
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index c5882b36e558..9a16d1e75a49 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -213,9 +213,11 @@ retry:
res = -ENOKEY;
goto out;
}
+ down_read(&keyring_key->sem);
ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
res = -EINVAL;
+ up_read(&keyring_key->sem);
goto out;
}
master_key = (struct ext4_encryption_key *)ukp->data;
@@ -226,10 +228,12 @@ retry:
"ext4: key size incorrect: %d\n",
master_key->size);
res = -ENOKEY;
+ up_read(&keyring_key->sem);
goto out;
}
res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
raw_key);
+ up_read(&keyring_key->sem);
if (res)
goto out;
got_key:
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cc7ca4e87144..0662b285dc8a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -378,14 +378,22 @@ struct flex_groups {
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */
+
+#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
+ EXT4_IMMUTABLE_FL | \
+ EXT4_APPEND_FL | \
+ EXT4_NODUMP_FL | \
+ EXT4_NOATIME_FL | \
+ EXT4_PROJINHERIT_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
- EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
+ EXT4_PROJINHERIT_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
@@ -555,10 +563,12 @@ enum {
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
- /* Do not take i_data_sem locking in ext4_map_blocks */
-#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100
+ /* Write zeros to newly created written extents */
+#define EXT4_GET_BLOCKS_ZERO 0x0200
+#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
+ EXT4_GET_BLOCKS_ZERO)
/*
* The bit position of these flags must not overlap with any of the
@@ -616,6 +626,46 @@ enum {
#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
+#ifndef FS_IOC_FSGETXATTR
+/* Until the uapi changes get merged for project quota... */
+
+#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr)
+
+/*
+ * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+ __u32 fsx_xflags; /* xflags field value (get/set) */
+ __u32 fsx_extsize; /* extsize field value (get/set)*/
+ __u32 fsx_nextents; /* nextents field value (get) */
+ __u32 fsx_projid; /* project identifier (get/set) */
+ unsigned char fsx_pad[12];
+};
+
+/*
+ * Flags for the fsx_xflags field
+ */
+#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
+#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
+#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
+#define FS_XFLAG_APPEND 0x00000010 /* all writes append */
+#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
+#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */
+#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
+#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
+#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
+#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
+#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
+#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
+#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
+#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
+#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
+#endif /* !defined(FS_IOC_FSGETXATTR) */
+
+#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
@@ -910,6 +960,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
+ /*
+ * i_mmap_sem is for serializing page faults with truncate / punch hole
+ * operations. We have to make sure that new pages cannot be faulted in
+ * a section of the inode that is being punched. We cannot easily use
+ * i_data_sem for this since we need protection for the whole punch
+ * operation and i_data_sem ranks below transaction start so we have
+ * to occasionally drop it.
+ */
+ struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -993,6 +1052,7 @@ struct ext4_inode_info {
/* Encryption params */
struct ext4_crypt_info *i_crypt_info;
#endif
+ kprojid_t i_projid;
};
/*
@@ -1248,7 +1308,7 @@ struct ext4_super_block {
#endif
/* Number of quota types we support */
-#define EXT4_MAXQUOTAS 2
+#define EXT4_MAXQUOTAS 3
/*
* fourth extended-fs super-block data in memory
@@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
- EXT4_FEATURE_RO_COMPAT_QUOTA)
+ EXT4_FEATURE_RO_COMPAT_QUOTA |\
+ EXT4_FEATURE_RO_COMPAT_PROJECT)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
#define EXT4_DEF_RESUID 0
#define EXT4_DEF_RESGID 0
+/*
+ * Default project ID
+ */
+#define EXT4_DEF_PROJID 0
+
#define EXT4_DEF_INODE_READAHEAD_BLKS 32
/*
@@ -2234,7 +2300,8 @@ void ext4_restore_control_page(struct page *data_page);
struct page *ext4_encrypt(struct inode *inode,
struct page *plaintext_page);
int ext4_decrypt(struct page *page);
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
int ext4_init_crypto(void);
@@ -2440,8 +2507,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
@@ -2484,9 +2551,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
+extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len);
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2825,7 +2896,7 @@ do { \
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
- !mutex_is_locked(&inode->i_mutex));
+ !inode_is_locked(inode));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = newsize;
@@ -2848,6 +2919,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
return changed;
}
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len);
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
@@ -2986,8 +3060,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
- struct dentry *dentry,
- struct inode *inode);
+ struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
struct inode *parent,
struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 551353b1b17a..0ffabaf90aa5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
- int ret;
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
-
- if (ext4_encrypted_inode(inode))
- return ext4_encrypted_zeroout(inode, ex);
-
- ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
- if (ret > 0)
- ret = 0;
-
- return ret;
+ return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
+ ee_len);
}
/*
@@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
+ allocated);
+ if (err < 0)
+ goto out2;
+ }
ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
ppath);
if (ret >= 0) {
@@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
if (len <= EXT_UNWRITTEN_MAX_LEN)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
/*
* credits to insert 1 extent into extent tree
*/
@@ -4752,8 +4748,6 @@ retry:
goto retry;
}
- ext4_inode_resume_unlocked_dio(inode);
-
return ret > 0 ? ret2 : ret;
}
@@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
- struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + len - 1);
- if (ret)
- return ret;
- }
-
- /*
* Round up offset. This is not fallocate, we neet to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
@@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
else
max_blocks -= lblk;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Indirect files do not support unwritten extnets
@@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags, mode);
if (ret)
- goto out_mutex;
+ goto out_dio;
}
@@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
- /* Now release the pages and zero block aligned part of pages*/
+ /*
+ * Prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ goto out_dio;
+ }
+ /* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
@@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
out_dio:
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* We only support preallocation for extent-based files only
@@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ ext4_inode_resume_unlocked_dio(inode);
if (ret)
goto out;
@@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
EXT4_I(inode)->i_sync_tid);
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
return ret;
}
@@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down offset to be aligned with page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
- mutex_lock(&inode->i_mutex);
-
+ inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
@@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /*
+ * Write tail of the last page before removed range since it will get
+ * removed from the page cache below.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ if (ret)
+ goto out_mmap;
+ /*
+ * Write data that will be shifted to preserve them when discarding
+ * page cache below. We are also protected from pages becoming dirty
+ * by i_mmap_sem.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
- mutex_lock(&inode->i_mutex);
-
+ inode_lock(inode);
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ret = -EOPNOTSUPP;
@@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down to align start offset to page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
/* Expand file to avoid data loss if there is error while shifting */
@@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
- BUG_ON(!mutex_is_locked(&inode1->i_mutex));
- BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+ BUG_ON(!inode_is_locked(inode1));
+ BUG_ON(!inode_is_locked(inode2));
*erp = ext4_es_remove_extent(inode1, lblk1, count);
if (unlikely(*erp))
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 113837e7ba98..1126436dada1 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_unwritten_wait(inode);
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
ret = __generic_file_write_iter(iocb, from);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret > 0) {
ssize_t err;
@@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ret;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (aio_mutex)
mutex_unlock(aio_mutex);
return ret;
}
#ifdef CONFIG_FS_DAX
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
- struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
- int err;
- if (!uptodate)
- return;
- WARN_ON(!buffer_unwritten(bh));
- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int result;
handle_t *handle = NULL;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
- }
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_fault(vma, vmf, ext4_get_block_dax,
- ext4_end_io_unwritten);
+ result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
- }
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
@@ -246,44 +238,88 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
ext4_chunk_trans_blocks(inode,
PMD_SIZE / PAGE_SIZE));
- }
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
result = __dax_pmd_fault(vma, addr, pmd, flags,
- ext4_get_block_dax, ext4_end_io_unwritten);
+ ext4_dax_mmap_get_block, NULL);
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
- }
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block_dax,
- ext4_end_io_unwritten);
+ int err;
+ struct inode *inode = file_inode(vma->vm_file);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(inode->i_sb);
+
+ return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ loff_t size;
+ int ret;
+
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = dax_pfn_mkwrite(vma, vmf);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+
+ return ret;
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
@@ -527,11 +563,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
int blkbits;
int ret = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -579,7 +615,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
dataoff = (loff_t)last << blkbits;
} while (last <= end);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (dataoff > isize)
return -ENXIO;
@@ -600,11 +636,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
int blkbits;
int ret = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -655,7 +691,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
break;
} while (last <= end);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (holeoff > isize)
holeoff = isize;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1b8024d26f65..3fcfd50a2e8a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -799,6 +799,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
+ ei->i_projid = EXT4_I(dir)->i_projid;
+ else
+ ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+
err = dquot_initialize(inode);
if (err)
goto out;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d884989cc83d..dfe3b9bafc0d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
*/
static int ext4_add_dirent_to_inline(handle_t *handle,
struct ext4_filename *fname,
- struct dentry *dentry,
+ struct inode *dir,
struct inode *inode,
struct ext4_iloc *iloc,
void *inline_start, int inline_size)
{
- struct inode *dir = d_inode(dentry->d_parent);
int err;
struct ext4_dir_entry_2 *de;
@@ -1245,12 +1244,11 @@ out:
* the new created block.
*/
int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry, struct inode *inode)
+ struct inode *dir, struct inode *inode)
{
int ret, inline_size;
void *inline_start;
struct ext4_iloc iloc;
- struct inode *dir = d_inode(dentry->d_parent);
ret = ext4_get_inode_loc(dir, &iloc);
if (ret)
@@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
+ ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
inline_start, inline_size);
if (ret != -ENOSPC)
goto out;
@@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
if (inline_size) {
inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
- ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+ ret = ext4_add_dirent_to_inline(handle, fname, dir,
inode, &iloc, inline_start,
inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3bd912df6bf..83bc8bfb3bea 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func,
return 0;
}
+int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ ext4_lblk_t len)
+{
+ int ret;
+
+ if (ext4_encrypted_inode(inode))
+ return ext4_encrypted_zeroout(inode, lblk, pblk, len);
+
+ ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
+ if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* out taking i_data_sem. So at the time the unwritten extent
* could be converted.
*/
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read(&EXT4_I(inode)->i_data_sem);
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- up_read((&EXT4_I(inode)->i_data_sem));
+ up_read((&EXT4_I(inode)->i_data_sem));
/*
* We don't check m_len because extent will be collpased in status
@@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read(&EXT4_I(inode)->i_data_sem);
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (ret < 0)
retval = ret;
}
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- up_read((&EXT4_I(inode)->i_data_sem));
+ up_read((&EXT4_I(inode)->i_data_sem));
found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -626,13 +637,29 @@ found:
}
/*
+ * We have to zeroout blocks before inserting them into extent
+ * status tree. Otherwise someone could look them up there and
+ * use them before they are really zeroed.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO &&
+ map->m_flags & EXT4_MAP_MAPPED &&
+ map->m_flags & EXT4_MAP_NEW) {
+ ret = ext4_issue_zeroout(inode, map->m_lblk,
+ map->m_pblk, map->m_len);
+ if (ret) {
+ retval = ret;
+ goto out_sem;
+ }
+ }
+
+ /*
* If the extent has been zeroed out, we don't need to update
* extent status tree.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
if (ext4_es_is_written(&es))
- goto has_zeroout;
+ goto out_sem;
}
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -643,11 +670,13 @@ found:
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
- if (ret < 0)
+ if (ret < 0) {
retval = ret;
+ goto out_sem;
+ }
}
-has_zeroout:
+out_sem:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
@@ -674,7 +703,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
+ if (flags && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -694,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
- if (IS_DAX(inode) && buffer_unwritten(bh)) {
- /*
- * dgc: I suspect unwritten conversion on ext4+DAX is
- * fundamentally broken here when there are concurrent
- * read/write in progress on this inode.
- */
- WARN_ON_ONCE(io_end);
- bh->b_assoc_map = inode->i_mapping;
- bh->b_private = (void *)(unsigned long)iblock;
- }
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -879,9 +898,6 @@ int do_journal_get_write_access(handle_t *handle,
return ret;
}
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-
#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
@@ -3054,25 +3070,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ int ret;
+
+ ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_NO_LOCK);
+ ret = _ext4_get_block(inode, iblock, bh_result, 0);
+ /*
+ * Blocks should have been preallocated! ext4_file_write_iter() checks
+ * that.
+ */
+ WARN_ON_ONCE(!buffer_mapped(bh_result));
+
+ return ret;
}
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+#ifdef CONFIG_FS_DAX
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
- int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
- if (create)
- flags |= EXT4_GET_BLOCKS_CREATE;
- ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ int ret, err;
+ int credits;
+ struct ext4_map_blocks map;
+ handle_t *handle = NULL;
+ int flags = 0;
+
+ ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result, flags);
+ map.m_lblk = iblock;
+ map.m_len = bh_result->b_size >> inode->i_blkbits;
+ credits = ext4_chunk_trans_blocks(inode, map.m_len);
+ if (create) {
+ flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+ }
+
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (create) {
+ err = ext4_journal_stop(handle);
+ if (ret >= 0 && err < 0)
+ ret = err;
+ }
+ if (ret <= 0)
+ goto out;
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int err2;
+
+ /*
+ * We are protected by i_mmap_sem so we know block cannot go
+ * away from under us even though we dropped i_data_sem.
+ * Convert extent to written and write zeros there.
+ *
+ * Note: We may get here even when create == 0.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ err = ext4_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (err < 0)
+ ret = err;
+ err2 = ext4_journal_stop(handle);
+ if (err2 < 0 && ret > 0)
+ ret = err2;
+ }
+out:
+ WARN_ON_ONCE(ret == 0 && create);
+ if (ret > 0) {
+ map_bh(bh_result, inode->i_sb, map.m_pblk);
+ bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
+ map.m_flags;
+ /*
+ * At least for now we have to clear BH_New so that DAX code
+ * doesn't attempt to zero blocks again in a racy way.
+ */
+ bh_result->b_state &= ~(1 << BH_New);
+ bh_result->b_size = map.m_len << inode->i_blkbits;
+ ret = 0;
+ }
+ return ret;
}
+#endif
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
@@ -3143,10 +3230,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
- if (overwrite) {
- down_read(&EXT4_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
- }
+ if (overwrite)
+ inode_unlock(inode);
/*
* We could direct write to holes and fallocate.
@@ -3189,7 +3274,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
if (overwrite) {
- get_block_func = ext4_get_block_write_nolock;
+ get_block_func = ext4_get_block_overwrite;
} else {
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
@@ -3245,10 +3330,8 @@ retake_lock:
if (iov_iter_rw(iter) == WRITE)
inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite) {
- up_read(&EXT4_I(inode)->i_data_sem);
- mutex_lock(&inode->i_mutex);
- }
+ if (overwrite)
+ inode_lock(inode);
return ret;
}
@@ -3559,6 +3642,35 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise i_disksize update
+ * can get lost as it may have been postponed to submission of writeback but
+ * that will never happen after we truncate page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ loff_t size = i_size_read(inode);
+
+ WARN_ON(!inode_is_locked(inode));
+ if (offset > size || offset + len < size)
+ return 0;
+
+ if (EXT4_I(inode)->i_disksize >= size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_update_i_disksize(inode, size);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return 0;
+}
+
+/*
* ext4_punch_hole: punches a hole in a file by releaseing the blocks
* associated with the given offset and length
*
@@ -3595,7 +3707,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
return ret;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
@@ -3623,17 +3735,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset)
+ if (last_block_offset > first_block_offset) {
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
+ if (ret)
+ goto out_dio;
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
+ }
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -3680,19 +3801,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- /* Now release the pages again to reduce race window */
- if (last_block_offset > first_block_offset)
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
-
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -3762,7 +3879,7 @@ void ext4_truncate(struct inode *inode)
* have i_mutex locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
@@ -4076,6 +4193,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
EXT4_I(inode)->i_inline_off = 0;
}
+int ext4_get_projid(struct inode *inode, kprojid_t *projid)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ return -EOPNOTSUPP;
+ *projid = EXT4_I(inode)->i_projid;
+ return 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -4087,6 +4212,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
int block;
uid_t i_uid;
gid_t i_gid;
+ projid_t i_projid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -4136,12 +4262,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
+ else
+ i_projid = EXT4_DEF_PROJID;
+
if (!(test_opt(inode->i_sb, NO_UID32))) {
i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
+ ei->i_projid = make_kprojid(&init_user_ns, i_projid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -4440,6 +4574,7 @@ static int ext4_do_update_inode(handle_t *handle,
int need_datasync = 0, set_large_file = 0;
uid_t i_uid;
gid_t i_gid;
+ projid_t i_projid;
spin_lock(&ei->i_raw_lock);
@@ -4452,6 +4587,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
i_uid = i_uid_read(inode);
i_gid = i_gid_read(inode);
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
if (!(test_opt(inode->i_sb, NO_UID32))) {
raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
@@ -4529,6 +4665,15 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le16(ei->i_extra_isize);
}
}
+
+ BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ i_projid != EXT4_DEF_PROJID);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
if (inode->i_sb->s_flags & MS_LAZYTIME)
@@ -4824,6 +4969,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
+ down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
@@ -4831,6 +4977,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
truncate_pagecache(inode, inode->i_size);
if (shrink)
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
if (!rc) {
@@ -5279,6 +5426,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -5348,6 +5497,19 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
+
+/*
+ * Page-fault handler wrapper: run the generic filemap_fault() while
+ * holding the inode's i_mmap_sem for reading, and hand back whatever
+ * filemap_fault() returned.
+ */
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct ext4_inode_info *ei = EXT4_I(file_inode(vma->vm_file));
+	int ret;
+
+	down_read(&ei->i_mmap_sem);
+	ret = filemap_fault(vma, vmf);
+	up_read(&ei->i_mmap_sem);
+
+	return ret;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5e872fd40e5e..0f6c36922c24 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/random.h>
+#include <linux/quotaops.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16])
return 1;
}
+/*
+ * ext4_ioctl_setflags - apply a new EXT4_*_FL flag word to an inode
+ * @inode: inode whose flags are changed
+ * @flags: requested flag word
+ *
+ * The ioctl callers take the inode lock around this call and pass @flags
+ * through ext4_mask_flags() first.
+ *
+ * Only bits in EXT4_FL_USER_MODIFIABLE are actually written.  Changing
+ * APPEND/IMMUTABLE requires CAP_LINUX_IMMUTABLE and changing JOURNAL_DATA
+ * requires CAP_SYS_RESOURCE.  A change of EXT4_EXTENTS_FL triggers a
+ * migration between extent and indirect mapping after the flags are stored.
+ *
+ * Returns 0 on success, -EPERM for quota files or missing capabilities,
+ * -EOPNOTSUPP when asked to add EOFBLOCKS, or the error of the failing
+ * journal/inode/migration step.
+ */
+static int ext4_ioctl_setflags(struct inode *inode,
+			       unsigned int flags)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle = NULL;
+	/* Default failure must be a negative errno, as for every other path */
+	int err = -EPERM, migrate = 0;
+	struct ext4_iloc iloc;
+	unsigned int oldflags, mask, i;
+	unsigned int jflag;
+
+	/* Is it quota file? Do not allow user to mess with it */
+	if (IS_NOQUOTA(inode))
+		goto flags_out;
+
+	oldflags = ei->i_flags;
+
+	/* The JOURNAL_DATA flag is modifiable only by root */
+	jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 *
+	 * This test looks nicer. Thanks to Pauline Middelink
+	 */
+	if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE))
+			goto flags_out;
+	}
+
+	/*
+	 * The JOURNAL_DATA flag can only be changed by
+	 * the relevant capability.
+	 */
+	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+		if (!capable(CAP_SYS_RESOURCE))
+			goto flags_out;
+	}
+	if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+		migrate = 1;
+
+	if (flags & EXT4_EOFBLOCKS_FL) {
+		/* we don't support adding EOFBLOCKS flag */
+		if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+			err = -EOPNOTSUPP;
+			goto flags_out;
+		}
+	} else if (oldflags & EXT4_EOFBLOCKS_FL)
+		ext4_truncate(inode);
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto flags_out;
+	}
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto flags_err;
+
+	/* Copy each user-modifiable bit of @flags into the inode, one by one */
+	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+		if (!(mask & EXT4_FL_USER_MODIFIABLE))
+			continue;
+		if (mask & flags)
+			ext4_set_inode_flag(inode, i);
+		else
+			ext4_clear_inode_flag(inode, i);
+	}
+
+	ext4_set_inode_flags(inode);
+	inode->i_ctime = ext4_current_time(inode);
+
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+	/* The handle is stopped on both the success and the failure path */
+	ext4_journal_stop(handle);
+	if (err)
+		goto flags_out;
+
+	/* Journal-mode switch and migration run outside the handle above */
+	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+		err = ext4_change_inode_journal_flag(inode, jflag);
+	if (err)
+		goto flags_out;
+	if (migrate) {
+		if (flags & EXT4_EXTENTS_FL)
+			err = ext4_ext_migrate(inode);
+		else
+			err = ext4_ind_migrate(inode);
+	}
+
+flags_out:
+	return err;
+}
+
+#ifdef CONFIG_QUOTA
+/*
+ * ext4_ioctl_setproject - change the project ID of the inode behind @filp
+ * @filp:   open file whose inode is modified
+ * @projid: new project ID, interpreted in init_user_ns
+ *
+ * Without the PROJECT ro-compat feature only EXT4_DEF_PROJID is accepted
+ * (as a no-op).  Setting the ID the inode already has is also a no-op.
+ * Otherwise the function takes write access to the mount and the inode
+ * lock itself, transfers project quota usage to the new ID when project
+ * quota limits are enabled, and stores the new ID in the in-core inode.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when project IDs cannot be stored,
+ * -EPERM for quota files, -EOVERFLOW when i_projid does not fit in the
+ * on-disk inode, or the error of the failing quota/journal/inode step.
+ */
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err, rc;
+	handle_t *handle;
+	kprojid_t kprojid;
+	struct ext4_iloc iloc;
+	struct ext4_inode *raw_inode;
+
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+		if (projid != EXT4_DEF_PROJID)
+			return -EOPNOTSUPP;
+		else
+			return 0;
+	}
+
+	/* The project ID lives in the extended part of the on-disk inode */
+	if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE)
+		return -EOPNOTSUPP;
+
+	kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+
+	if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
+		return 0;
+
+	err = mnt_want_write_file(filp);
+	if (err)
+		return err;
+
+	err = -EPERM;
+	inode_lock(inode);
+	/* Is it quota file? Do not allow user to mess with it */
+	if (IS_NOQUOTA(inode))
+		goto out_unlock;
+
+	err = ext4_get_inode_loc(inode, &iloc);
+	if (err)
+		goto out_unlock;
+
+	/* Only peek at the raw inode to verify i_projid fits; drop bh after */
+	raw_inode = ext4_raw_inode(&iloc);
+	if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
+		err = -EOVERFLOW;
+		brelse(iloc.bh);
+		goto out_unlock;
+	}
+	brelse(iloc.bh);
+
+	/* Do not carry on with a transfer if quota setup itself failed */
+	err = dquot_initialize(inode);
+	if (err)
+		goto out_unlock;
+
+	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+		EXT4_QUOTA_INIT_BLOCKS(sb) +
+		EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_unlock;
+	}
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_stop;
+
+	if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
+		struct dquot *transfer_to[MAXQUOTAS] = { };
+
+		transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+		/*
+		 * dqget() reports failure with an ERR_PTR, which is non-NULL;
+		 * it must never reach __dquot_transfer() or dqput().
+		 */
+		if (!IS_ERR(transfer_to[PRJQUOTA])) {
+			err = __dquot_transfer(inode, transfer_to);
+			dqput(transfer_to[PRJQUOTA]);
+			if (err)
+				goto out_dirty;
+		}
+	}
+	EXT4_I(inode)->i_projid = kprojid;
+	inode->i_ctime = ext4_current_time(inode);
+out_dirty:
+	/* Reached on transfer failure too; keep the first error seen */
+	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)
+		err = rc;
+out_stop:
+	ext4_journal_stop(handle);
+out_unlock:
+	inode_unlock(inode);
+	mnt_drop_write_file(filp);
+	return err;
+}
+#else
+/*
+ * Variant built without CONFIG_QUOTA: nothing can be stored, so only a
+ * request for the default project ID succeeds (as a no-op); any other
+ * ID is rejected with -EOPNOTSUPP.
+ */
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+	return (projid == EXT4_DEF_PROJID) ? 0 : -EOPNOTSUPP;
+}
+#endif
+
+/*
+ * Translate ext4 inode flags (EXT4_*_FL) into the FS_XFLAG_* bits used
+ * by struct fsxattr.  Only SYNC, IMMUTABLE, APPEND, NODUMP, NOATIME and
+ * PROJINHERIT are mapped; every other bit of @iflags is ignored.
+ */
+static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
+{
+	__u32 out = 0;
+
+	out |= (iflags & EXT4_SYNC_FL) ? FS_XFLAG_SYNC : 0;
+	out |= (iflags & EXT4_IMMUTABLE_FL) ? FS_XFLAG_IMMUTABLE : 0;
+	out |= (iflags & EXT4_APPEND_FL) ? FS_XFLAG_APPEND : 0;
+	out |= (iflags & EXT4_NODUMP_FL) ? FS_XFLAG_NODUMP : 0;
+	out |= (iflags & EXT4_NOATIME_FL) ? FS_XFLAG_NOATIME : 0;
+	out |= (iflags & EXT4_PROJINHERIT_FL) ? FS_XFLAG_PROJINHERIT : 0;
+
+	return out;
+}
+
+/*
+ * Translate FS_XFLAG_* bits from struct fsxattr into ext4 inode flags
+ * (EXT4_*_FL).  Only SYNC, IMMUTABLE, APPEND, NODUMP, NOATIME and
+ * PROJINHERIT are mapped; every other bit of @xflags is ignored.
+ */
+static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
+{
+	unsigned long out = 0;
+
+	out |= (xflags & FS_XFLAG_SYNC) ? EXT4_SYNC_FL : 0;
+	out |= (xflags & FS_XFLAG_IMMUTABLE) ? EXT4_IMMUTABLE_FL : 0;
+	out |= (xflags & FS_XFLAG_APPEND) ? EXT4_APPEND_FL : 0;
+	out |= (xflags & FS_XFLAG_NODUMP) ? EXT4_NODUMP_FL : 0;
+	out |= (xflags & FS_XFLAG_NOATIME) ? EXT4_NOATIME_FL : 0;
+	out |= (xflags & FS_XFLAG_PROJINHERIT) ? EXT4_PROJINHERIT_FL : 0;
+
+	return out;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
- handle_t *handle = NULL;
- int err, migrate = 0;
- struct ext4_iloc iloc;
- unsigned int oldflags, mask, i;
- unsigned int jflag;
+ int err;
if (!inode_owner_or_capable(inode))
return -EACCES;
@@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = ext4_mask_flags(inode->i_mode, flags);
- err = -EPERM;
- mutex_lock(&inode->i_mutex);
- /* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode))
- goto flags_out;
-
- oldflags = ei->i_flags;
-
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- *
- * This test looks nicer. Thanks to Pauline Middelink
- */
- if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- goto flags_out;
- }
-
- /*
- * The JOURNAL_DATA flag can only be changed by
- * the relevant capability.
- */
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE))
- goto flags_out;
- }
- if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
- migrate = 1;
-
- if (flags & EXT4_EOFBLOCKS_FL) {
- /* we don't support adding EOFBLOCKS flag */
- if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (oldflags & EXT4_EOFBLOCKS_FL)
- ext4_truncate(inode);
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto flags_out;
- }
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto flags_err;
-
- for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
- if (!(mask & EXT4_FL_USER_MODIFIABLE))
- continue;
- if (mask & flags)
- ext4_set_inode_flag(inode, i);
- else
- ext4_clear_inode_flag(inode, i);
- }
-
- ext4_set_inode_flags(inode);
- inode->i_ctime = ext4_current_time(inode);
-
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-flags_err:
- ext4_journal_stop(handle);
- if (err)
- goto flags_out;
-
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
- err = ext4_change_inode_journal_flag(inode, jflag);
- if (err)
- goto flags_out;
- if (migrate) {
- if (flags & EXT4_EXTENTS_FL)
- err = ext4_ext_migrate(inode);
- else
- err = ext4_ind_migrate(inode);
- }
-
-flags_out:
- mutex_unlock(&inode->i_mutex);
+ inode_lock(inode);
+ err = ext4_ioctl_setflags(inode, flags);
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return err;
}
@@ -349,7 +497,7 @@ flags_out:
goto setversion_out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -364,7 +512,7 @@ flags_out:
ext4_journal_stop(handle);
unlock_out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
setversion_out:
mnt_drop_write_file(filp);
return err;
@@ -510,9 +658,9 @@ group_add_out:
* ext4_ext_swap_inode_data before we switch the
* inode format to prevent read.
*/
- mutex_lock(&(inode->i_mutex));
+ inode_lock((inode));
err = ext4_ext_migrate(inode);
- mutex_unlock(&(inode->i_mutex));
+ inode_unlock((inode));
mnt_drop_write_file(filp);
return err;
}
@@ -689,6 +837,60 @@ encryption_policy_out:
return -EOPNOTSUPP;
#endif
}
+ case EXT4_IOC_FSGETXATTR:
+ {
+ struct fsxattr fa;
+
+ memset(&fa, 0, sizeof(struct fsxattr));
+ ext4_get_inode_flags(ei);
+ fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+ fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
+ EXT4_I(inode)->i_projid);
+ }
+
+ if (copy_to_user((struct fsxattr __user *)arg,
+ &fa, sizeof(fa)))
+ return -EFAULT;
+ return 0;
+ }
+ case EXT4_IOC_FSSETXATTR:
+ {
+ struct fsxattr fa;
+ int err;
+
+ if (copy_from_user(&fa, (struct fsxattr __user *)arg,
+ sizeof(fa)))
+ return -EFAULT;
+
+ /* Make sure caller has proper permission */
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+ flags = ext4_mask_flags(inode->i_mode, flags);
+
+ inode_lock(inode);
+ flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
+ (flags & EXT4_FL_XFLAG_VISIBLE);
+ err = ext4_ioctl_setflags(inode, flags);
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+
+ err = ext4_ioctl_setproject(filp, fa.fsx_projid);
+ if (err)
+ return err;
+
+ return 0;
+ }
default:
return -ENOTTY;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f27e0c2598c5..06574dd77614 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry, struct inode *inode);
+ struct inode *dir, struct inode *inode);
/* checksumming functions */
void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -1928,10 +1928,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* directory, and adds the dentry to the indexed directory.
*/
static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry,
+ struct inode *dir,
struct inode *inode, struct buffer_head *bh)
{
- struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head *bh2;
struct dx_root *root;
struct dx_frame frames[2], *frame;
@@ -2086,8 +2085,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
if (ext4_has_inline_data(dir)) {
- retval = ext4_try_add_inline_entry(handle, &fname,
- dentry, inode);
+ retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
if (retval < 0)
goto out;
if (retval == 1) {
@@ -2097,7 +2095,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
if (is_dx(dir)) {
- retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
+ retval = ext4_dx_add_entry(handle, &fname, dir, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
@@ -2119,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (blocks == 1 && !dx_fallback &&
ext4_has_feature_dir_index(sb)) {
- retval = make_indexed_dir(handle, &fname, dentry,
+ retval = make_indexed_dir(handle, &fname, dir,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
goto out;
@@ -2154,12 +2152,11 @@ out:
* Returns 0 for success, or a negative error value
*/
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry, struct inode *inode)
+ struct inode *dir, struct inode *inode)
{
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
struct buffer_head *bh;
- struct inode *dir = d_inode(dentry->d_parent);
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
int err;
@@ -2756,7 +2753,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
return 0;
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !mutex_is_locked(&inode->i_mutex));
+ !inode_is_locked(inode));
/*
* Exit early if inode already is on orphan list. This is a big speedup
* since we don't have to contend on the global s_orphan_lock.
@@ -2838,7 +2835,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
return 0;
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !mutex_is_locked(&inode->i_mutex));
+ !inode_is_locked(inode));
/* Do this quick check before taking global s_orphan_lock. */
if (list_empty(&ei->i_orphan))
return 0;
@@ -3212,6 +3209,12 @@ static int ext4_link(struct dentry *old_dentry,
if (ext4_encrypted_inode(dir) &&
!ext4_is_child_context_consistent_with_parent(dir, inode))
return -EPERM;
+
+ if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
err = dquot_initialize(dir);
if (err)
return err;
@@ -3492,6 +3495,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
int credits;
u8 old_file_type;
+ if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(new_dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
retval = dquot_initialize(old.dir);
if (retval)
return retval;
@@ -3701,6 +3709,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new.inode)))
return -EPERM;
+ if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
+ !projid_eq(EXT4_I(new_dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)) ||
+ (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) &&
+ !projid_eq(EXT4_I(old_dir)->i_projid,
+ EXT4_I(new_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
retval = dquot_initialize(old.dir);
if (retval)
return retval;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9ab67da6e5a..3ed01ec011d7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+/*
+ * Lock ordering
+ *
+ * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
+ * i_mmap_rwsem (inode->i_mmap_rwsem)!
+ *
+ * page fault path:
+ * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
+ * page lock -> i_data_sem (rw)
+ *
+ * buffered write path:
+ * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> transaction start -> page lock ->
+ * i_data_sem (rw)
+ *
+ * truncate:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ * i_mmap_rwsem (w) -> page lock
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ * transaction start -> i_data_sem (rw)
+ *
+ * direct IO:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) ->
+ * transaction start -> i_data_sem (rw)
+ *
+ * writepages:
+ * transaction start -> page lock(s) -> i_data_sem (rw)
+ */
+
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
@@ -958,6 +988,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
+ init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -966,7 +997,7 @@ static int __init init_inodecache(void)
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
@@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
}
#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
-#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+static char *quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
@@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = {
.write_info = ext4_write_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_projid = ext4_get_projid,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb,
__func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
nr_truncates++;
} else {
if (test_opt(sb, DEBUG))
@@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
"without CONFIG_QUOTA");
return 0;
}
+ if (ext4_has_feature_project(sb) && !readonly) {
+ ext4_msg(sb, KERN_ERR,
+ "Filesystem with project quota feature cannot be mounted RDWR "
+ "without CONFIG_QUOTA");
+ return 0;
+ }
#endif /* CONFIG_QUOTA */
return 1;
}
@@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
- sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
@@ -4790,6 +4828,48 @@ restore_opts:
return err;
}
+#ifdef CONFIG_QUOTA
+/*
+ * Clamp the statfs numbers in @buf to the quota limits of project
+ * @projid.
+ *
+ * For both blocks and inodes the soft limit is used when it is set,
+ * otherwise the hard limit.  When a limit exists and is smaller than the
+ * total already in @buf, the total is replaced by the limit and the free
+ * count becomes limit minus current usage (never below zero).  The dquot
+ * fields are read under dq_data_lock.
+ *
+ * Returns 0, or the error encoded in the dqget() result.
+ */
+static int ext4_statfs_project(struct super_block *sb,
+			       kprojid_t projid, struct kstatfs *buf)
+{
+	struct kqid qid = make_kqid_projid(projid);
+	struct dquot *dquot = dqget(sb, qid);
+	u64 limit;
+	u64 curblock;
+
+	if (IS_ERR(dquot))
+		return PTR_ERR(dquot);
+
+	spin_lock(&dq_data_lock);
+
+	/* Block limit, converted to filesystem blocks via s_blocksize_bits */
+	if (dquot->dq_dqb.dqb_bsoftlimit)
+		limit = dquot->dq_dqb.dqb_bsoftlimit >> sb->s_blocksize_bits;
+	else
+		limit = dquot->dq_dqb.dqb_bhardlimit >> sb->s_blocksize_bits;
+
+	if (limit && buf->f_blocks > limit) {
+		curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+		buf->f_blocks = limit;
+		if (buf->f_blocks > curblock)
+			buf->f_bavail = buf->f_blocks - curblock;
+		else
+			buf->f_bavail = 0;
+		buf->f_bfree = buf->f_bavail;
+	}
+
+	/* Inode limit */
+	if (dquot->dq_dqb.dqb_isoftlimit)
+		limit = dquot->dq_dqb.dqb_isoftlimit;
+	else
+		limit = dquot->dq_dqb.dqb_ihardlimit;
+
+	if (limit && buf->f_files > limit) {
+		buf->f_files = limit;
+		if (buf->f_files > dquot->dq_dqb.dqb_curinodes)
+			buf->f_ffree = buf->f_files -
+				       dquot->dq_dqb.dqb_curinodes;
+		else
+			buf->f_ffree = 0;
+	}
+
+	spin_unlock(&dq_data_lock);
+	dqput(dquot);
+	return 0;
+}
+#endif
+
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+#ifdef CONFIG_QUOTA
+ if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
+ sb_has_quota_limits_enabled(sb, PRJQUOTA))
+ ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
+#endif
return 0;
}
@@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
struct inode *qf_inode;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
- le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
BUG_ON(!ext4_has_feature_quota(sb));
@@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb)
int type, err = 0;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
- le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670d99..c70d06a383e2 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ac9e7c6aac74..5c06db17e41f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (start >= isize)
@@ -860,7 +860,7 @@ out:
if (ret == 1)
ret = 0;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 18ddb1e5182a..ea272be62677 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
loff_t isize;
int err = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize)
@@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
found:
if (whence == SEEK_HOLE && data_ofs > isize)
data_ofs = isize;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return vfs_setpos(file, data_ofs, maxbytes);
fail:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode,
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (mode & FALLOC_FL_PUNCH_HOLE) {
if (offset >= inode->i_size)
@@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
trace_f2fs_fallocate(inode, mode, offset, len, ret);
return ret;
@@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = f2fs_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = fi->i_flags;
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto out;
}
@@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = flags & FS_FL_USER_MODIFIABLE;
flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
fi->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
f2fs_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME;
@@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
f2fs_balance_fs(sbi, true);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* writeback all dirty pages in the range */
err = filemap_write_and_wait_range(inode->i_mapping, range->start,
@@ -1778,7 +1778,7 @@ do_map:
clear_out:
clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!err)
range->len = (u64)total << PAGE_CACHE_SHIFT;
return err;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3bf990b80026..6134832baaaf 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1541,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
- f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info));
+ f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+ sizeof(struct f2fs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec2..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can access on only allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 8b2127ffb226..d0b95c95079b 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
@@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
buf.dirent = dirent;
buf.result = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
buf.ctx.pos = file->f_pos;
ret = -ENOENT;
if (!IS_DEADDIR(inode)) {
@@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
short_only, both ? &buf : NULL);
file->f_pos = buf.ctx.pos;
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret >= 0)
ret = buf.result;
return ret;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab..e6b764a17a9c 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -87,7 +87,7 @@ struct msdos_sb_info {
unsigned int vol_id; /*volume ID*/
int fatent_shift;
- struct fatent_operations *fatent_ops;
+ const struct fatent_operations *fatent_ops;
struct inode *fat_inode;
struct inode *fsinfo_inode;
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 8226557130a2..1d9a8c4e9de0 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -99,7 +99,7 @@ err:
static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
int offset, sector_t blocknr)
{
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
@@ -246,7 +246,7 @@ static int fat32_ent_next(struct fat_entry *fatent)
return 0;
}
-static struct fatent_operations fat12_ops = {
+static const struct fatent_operations fat12_ops = {
.ent_blocknr = fat12_ent_blocknr,
.ent_set_ptr = fat12_ent_set_ptr,
.ent_bread = fat12_ent_bread,
@@ -255,7 +255,7 @@ static struct fatent_operations fat12_ops = {
.ent_next = fat12_ent_next,
};
-static struct fatent_operations fat16_ops = {
+static const struct fatent_operations fat16_ops = {
.ent_blocknr = fat_ent_blocknr,
.ent_set_ptr = fat16_ent_set_ptr,
.ent_bread = fat_ent_bread,
@@ -264,7 +264,7 @@ static struct fatent_operations fat16_ops = {
.ent_next = fat16_ent_next,
};
-static struct fatent_operations fat32_ops = {
+static const struct fatent_operations fat32_ops = {
.ent_blocknr = fat_ent_blocknr,
.ent_set_ptr = fat32_ent_set_ptr,
.ent_bread = fat_ent_bread,
@@ -320,7 +320,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
int offset, sector_t blocknr)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct buffer_head **bhs = fatent->bhs;
/* Is this fatent's blocks including this entry? */
@@ -349,7 +349,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
int err, offset;
sector_t blocknr;
@@ -407,7 +407,7 @@ int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
int new, int wait)
{
struct super_block *sb = inode->i_sb;
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
int err;
ops->ent_put(fatent, new);
@@ -432,7 +432,7 @@ static inline int fat_ent_next(struct msdos_sb_info *sbi,
static inline int fat_ent_read_block(struct super_block *sb,
struct fat_entry *fatent)
{
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
sector_t blocknr;
int offset;
@@ -463,7 +463,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent, prev_ent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, count, err, nr_bhs, idx_clus;
@@ -551,7 +551,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, err, nr_bhs;
@@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(fat_free_clusters);
static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
unsigned long reada_blocks)
{
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
sector_t blocknr;
int i, offset;
@@ -649,7 +649,7 @@ static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
int fat_count_free_clusters(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
unsigned long reada_blocks, reada_mask, cur_block;
int err = 0, free;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a..f70185668832 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,15 +14,19 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
attr = fat_make_attrs(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return put_user(attr, user_attr);
}
@@ -43,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
err = mnt_want_write_file(file);
if (err)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
fat_save_attrs(inode, attr);
mark_inode_dirty(inode);
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(file);
out:
return err;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from the sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ *
+ * Only regular files are supported, and FALLOC_FL_KEEP_SIZE is the only
+ * mode flag accepted; anything else yields -EOPNOTSUPP. Returns 0 on
+ * success, including when the requested range is already covered, or
+ * the error reported by fat_add_cluster() / fat_cont_expand(). The
+ * inode lock is held while the allocation runs.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* Block-aligned on-disk size in bytes (i_blocks << 9) */
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* Only regular files are supported (no directories or other types). */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize) /* already covered: nothing to do */
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits; /* rounds up, assuming cluster_size == 1 << cluster_bits */
+
+ /* Start the allocation. We are not zeroing out the clusters. */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode)) /* no growth needed */
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error: /* Common exit; also reached on success with err == 0. */
+ inode_unlock(inode);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..a5599052116c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * Allocate a new cluster only at a cluster boundary (offset == 0)
+ * and only when iblock is not below last_block, that is:
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{ /* Lookup-only get_block callback; _fat_bmap() hands it to generic_block_bmap(). */
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; /* requested size, in blocks */
+ int err;
+ sector_t bmap; /* block number from fat_bmap(); 0 leaves bh_result unmapped */
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0); /* this path must never be asked to allocate */
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); /* from_bmap = true */
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks); /* clamp to what fat_bmap() reported */
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits; /* store the size back, scaled by block size */
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -449,6 +480,24 @@ static int fat_calc_dir_size(struct inode *inode)
return 0;
}
+static int fat_validate_dir(struct inode *dir)
+{ /* Sanity-check a directory inode: 0 if it passes, -EIO after fat_fs_error() if not. */
+ struct super_block *sb = dir->i_sb;
+
+ if (dir->i_nlink < 2) {
+ /* A directory should have at least the "." and ".." entries. */
+ fat_fs_error(sb, "corrupted directory (invalid entries)");
+ return -EIO;
+ }
+ if (MSDOS_I(dir)->i_start == 0 ||
+ MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) {
+ /*
+ * Directory should point to a valid cluster: a start cluster
+ * of 0, or one equal to the superblock's root_cluster, is
+ * rejected.
+ */
+ fat_fs_error(sb, "corrupted directory (invalid i_start)");
+ return -EIO;
+ }
+ return 0;
+}
+
/* doesn't deal with root inode */
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
@@ -475,6 +524,10 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
MSDOS_I(inode)->mmu_private = inode->i_size;
set_nlink(inode, fat_subdirs(inode));
+
+ error = fat_validate_dir(inode);
+ if (error < 0)
+ return error;
} else { /* not a directory */
inode->i_generation |= 1;
inode->i_mode = fat_make_mode(sbi, de->attr,
@@ -553,13 +606,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /*
+ * Release unwritten fallocated blocks on inode eviction. This
+ * applies when the on-disk size (i_blocks << 9) is larger than
+ * mmu_private rounded up to a multiple of the cluster size.
+ * fat_evict_inode() calls this for inodes that still have links.
+ */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/i_logstart
+ * for a zero-byte file. So, return it to its original
+ * state during evict and commit that to disk, to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) { /* best effort: the failure is only logged, not returned */
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
@@ -677,7 +760,7 @@ static int __init fat_init_inodecache(void)
fat_inode_cachep = kmem_cache_create("fat_inode_cache",
sizeof(struct msdos_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (fat_inode_cachep == NULL)
return -ENOMEM;
@@ -1146,7 +1229,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_time_offset:
if (match_int(&args[0], &option))
return -EINVAL;
- if (option < -12 * 60 || option > 12 * 60)
+ /*
+ * GMT+-12 zones may have DST corrections so at least
+ * 13 hours difference is needed. Make the limit 24
+ * just in case someone invents something unusual.
+ */
+ if (option < -24 * 60 || option > 24 * 60)
return -EINVAL;
opts->tz_set = 1;
opts->time_offset = option;
diff --git a/fs/file.c b/fs/file.c
index 1aed0add16a2..1fbc5c0555a9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
* vmalloc() if the allocation size will be considered "large" by the VM.
*/
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+ void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+ __GFP_NOWARN | __GFP_NORETRY);
if (data != NULL)
return data;
}
- return vmalloc(size);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
@@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 5797d45a78cb..c5618db110be 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs)
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
struct file_system_type **p;
- for (p=&file_systems; *p; p=&(*p)->next)
- if (strlen((*p)->name) == len &&
- strncmp((*p)->name, name, len) == 0)
+ for (p = &file_systems; *p; p = &(*p)->next)
+ if (strncmp((*p)->name, name, len) == 0 &&
+ !(*p)->name[len])
break;
return p;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 023f6a1f23cd..6915c950e6e8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
if (!wbc->wb)
return;
- rcu_read_lock();
id = mem_cgroup_css_from_page(page)->id;
- rcu_read_unlock();
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 712601f299b8..4b855b65d457 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- mutex_lock(&parent->i_mutex);
+ inode_lock(parent);
if (!S_ISDIR(parent->i_mode))
goto unlock;
@@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
fuse_invalidate_entry(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
- mutex_lock(&d_inode(entry)->i_mutex);
+ inode_lock(d_inode(entry));
if (get_node_id(d_inode(entry)) != child_nodeid) {
err = -ENOENT;
goto badentry;
@@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
clear_nlink(d_inode(entry));
err = 0;
badentry:
- mutex_unlock(&d_inode(entry)->i_mutex);
+ inode_unlock(d_inode(entry));
if (!err)
d_delete(entry);
} else {
@@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
dput(entry);
unlock:
- mutex_unlock(&parent->i_mutex);
+ inode_unlock(parent);
iput(parent);
return err;
}
@@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode)
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- BUG_ON(!mutex_is_locked(&inode->i_mutex));
+ BUG_ON(!inode_is_locked(inode));
spin_lock(&fc->lock);
BUG_ON(fi->writectr < 0);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 570ca4053c80..b03d253ece15 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
return err;
if (lock_inode)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
@@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
fuse_finish_open(inode, file);
if (lock_inode)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
fuse_sync_writes(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
@@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
if (is_bad_inode(inode))
return -EIO;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Start writeback against all dirty pages of the inode, then
@@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
err = 0;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
@@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
out:
current->backing_dev_info = NULL;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return written ? written : err;
}
@@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
fuse_sync_writes(inode);
if (!write)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
while (count) {
@@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
return -EIO;
/* Don't allow parallel writes to the same file */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
res = generic_write_checks(iocb, from);
if (res > 0)
res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
fuse_invalidate_attr(inode);
if (res > 0)
fuse_write_update_size(inode, iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -2231,20 +2231,77 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
return err ? 0 : outarg.block;
}
+static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
+{ /*
+ * Resolve a seek by sending a FUSE_LSEEK request to the filesystem.
+ * fuse_file_llseek() calls this for SEEK_HOLE and SEEK_DATA with the
+ * inode lock held. Returns the new position, or a negative error.
+ * If the filesystem answers -ENOSYS, this and all later calls fall
+ * back to generic_file_llseek().
+ */
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ FUSE_ARGS(args);
+ struct fuse_lseek_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .whence = whence
+ };
+ struct fuse_lseek_out outarg;
+ int err;
+
+ if (fc->no_lseek) /* set below once a request has failed with -ENOSYS */
+ goto fallback;
+
+ args.in.h.opcode = FUSE_LSEEK;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (err) {
+ if (err == -ENOSYS) {
+ fc->no_lseek = 1; /* remember, so later seeks skip the request */
+ goto fallback;
+ }
+ return err; /* any other error goes to the caller unchanged */
+ }
+
+ return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); /* apply the offset from the reply */
+
+fallback: /* no FUSE_LSEEK: generic seek, but only if the attribute update succeeds */
+ err = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!err)
+ return generic_file_llseek(file, offset, whence);
+ else
+ return err;
+}
+
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
loff_t retval;
struct inode *inode = file_inode(file);
- /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
- if (whence == SEEK_CUR || whence == SEEK_SET)
- return generic_file_llseek(file, offset, whence);
-
- mutex_lock(&inode->i_mutex);
- retval = fuse_update_attributes(inode, NULL, file, NULL);
- if (!retval)
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
retval = generic_file_llseek(file, offset, whence);
- mutex_unlock(&inode->i_mutex);
+ break;
+ case SEEK_END:
+ inode_lock(inode);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, whence);
+ inode_unlock(inode);
+ break;
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ inode_lock(inode);
+ retval = fuse_lseek(file, offset, whence);
+ inode_unlock(inode);
+ break;
+ default:
+ retval = -EINVAL;
+ }
return retval;
}
@@ -2887,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
return -EOPNOTSUPP;
if (lock_inode) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (mode & FALLOC_FL_PUNCH_HOLE) {
loff_t endbyte = offset + length - 1;
err = filemap_write_and_wait_range(inode->i_mapping,
@@ -2933,7 +2990,7 @@ out:
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (lock_inode)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 405113101db8..ce394b5fe6b4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -605,6 +605,9 @@ struct fuse_conn {
/** Does the filesystem support asynchronous direct-IO submission? */
unsigned async_dio:1;
+ /** Is lseek not implemented by fs? */
+ unsigned no_lseek:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2a5b99..4d69d5c0bedc 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
int err;
fuse_inode_cachep = kmem_cache_create("fuse_inode",
- sizeof(struct fuse_inode),
- 0, SLAB_HWCACHE_ALIGN,
+ sizeof(struct fuse_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
fuse_inode_init_once);
err = -ENOMEM;
if (!fuse_inode_cachep)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7412863cda1e..c9384f932975 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
@@ -946,7 +946,7 @@ out_unlock:
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f348cfb6b69a..437fd73e381e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/posix_acl.h>
+#include <linux/security.h>
#include "gfs2.h"
#include "incore.h"
@@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
if (ip) {
set_bit(GIF_INVALID, &ip->i_flags);
forget_all_cached_acls(&ip->i_inode);
+ security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
}
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3e94400d587c..352f958769e1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2067,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
if (ret)
@@ -2094,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
gfs2_glock_dq_uninit(&gh);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 1d709d496364..f99f8e94de3f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void)
gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
sizeof(struct gfs2_inode),
0, SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT,
gfs2_init_inode_once);
if (!gfs2_inode_cachep)
goto fail;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index be6d9c450b22..a39891344259 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
return -ENOMEM;
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
- mutex_lock(&ip->i_inode.i_mutex);
+ inode_lock(&ip->i_inode);
for (qx = 0; qx < num_qd; qx++) {
error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &ghs[qx]);
@@ -953,7 +953,7 @@ out_alloc:
out:
while (qx--)
gfs2_glock_dq_uninit(&ghs[qx]);
- mutex_unlock(&ip->i_inode.i_mutex);
+ inode_unlock(&ip->i_inode);
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
return error;
@@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (error)
goto out_put;
- mutex_lock(&ip->i_inode.i_mutex);
+ inode_lock(&ip->i_inode);
error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
if (error)
goto out_unlockput;
@@ -1739,7 +1739,7 @@ out_i:
out_q:
gfs2_glock_dq_uninit(&q_gh);
out_unlockput:
- mutex_unlock(&ip->i_inode.i_mutex);
+ inode_unlock(&ip->i_inode);
out_put:
qd_put(qd);
return error;
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index db458ee3a546..1eb5d415d434 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -214,7 +214,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
{
struct super_block *sb;
struct hfs_find_data fd;
- struct list_head *pos;
+ struct hfs_readdir_data *rd;
int res, type;
hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
@@ -240,9 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
}
}
- list_for_each(pos, &HFS_I(dir)->open_dir_list) {
- struct hfs_readdir_data *rd =
- list_entry(pos, struct hfs_readdir_data, list);
+ list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) {
if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
rd->file->f_pos--;
}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 70788e03820a..e9f2b855f831 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
{
struct hfs_readdir_data *rd = file->private_data;
if (rd) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
list_del(&rd->list);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kfree(rd);
}
return 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index b99ebddb10cb..6686bf39a5b5 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file)
if (HFS_IS_RSRC(inode))
inode = HFS_I(inode)->rsrc_inode;
if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
hfs_file_truncate(inode);
//if (inode->i_flags & S_DEAD) {
// hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
// hfs_delete_inode(inode);
//}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
@@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* sync the inode to buffers */
ret = write_inode_now(inode, 0);
@@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
err = sync_blockdev(sb->s_bdev);
if (!ret)
ret = err;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4574fdd3d421..1ca95c232bb5 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
int err;
hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
- sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
- hfs_init_once);
+ sizeof(struct hfs_inode_info), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
if (!hfs_inode_cachep)
return -ENOMEM;
err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d0f39dcbb58e..a4e867e08947 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
{
struct hfsplus_readdir_data *rd = file->private_data;
if (rd) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
list_del(&rd->list);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kfree(rd);
}
return 0;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 19b33f8151f1..1a6394cdb54e 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
if (HFSPLUS_IS_RSRC(inode))
inode = HFSPLUS_I(inode)->rsrc_inode;
if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
hfsplus_file_truncate(inode);
if (inode->i_flags & S_DEAD) {
hfsplus_delete_cat(inode->i_ino,
HFSPLUS_SB(sb)->hidden_dir, NULL);
hfsplus_delete_inode(inode);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
@@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Sync inode metadata into the catalog and extent trees.
@@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 0624ce4e0702..32a49e292b6a 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
goto out_drop_write;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
@@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
mark_inode_dirty(inode);
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out_drop_write:
mnt_drop_write_file(file);
out:
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96ae8bf..5d54490a136d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
int err;
hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
- HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+ HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
hfsplus_init_once);
if (!hfsplus_inode_cachep)
return -ENOMEM;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f49be23e78aa..d1abbee281d1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+ hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index dc540bfcee1d..e57a53c13d86 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
if (whence == SEEK_DATA || whence == SEEK_HOLE)
return -EINVAL;
- mutex_lock(&i->i_mutex);
+ inode_lock(i);
hpfs_lock(s);
/*pr_info("dir lseek\n");*/
@@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
ok:
filp->f_pos = new_off;
hpfs_unlock(s);
- mutex_unlock(&i->i_mutex);
+ inode_unlock(i);
return new_off;
fail:
/*pr_warn("illegal lseek: %016llx\n", new_off);*/
hpfs_unlock(s);
- mutex_unlock(&i->i_mutex);
+ inode_unlock(i);
return -ESPIPE;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a561591896bd..458cf463047b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
sizeof(struct hpfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (hpfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d8f51ee8126b..e1f465a389d5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,11 +4,11 @@
* Nadia Yvette Chambers, 2002
*
* Copyright (C) 2002 Linus Torvalds.
+ * License: GPL
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
@@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
file_accessed(file);
ret = -ENOMEM;
@@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
inode->i_size = len;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
delete_from_page_cache(page);
}
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+ struct vm_area_struct *vma;
+
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+ unsigned long v_offset;
+ unsigned long v_end;
+
+ /*
+ * Can the expression below overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ v_offset = 0;
+
+ if (!end)
+ v_end = vma->vm_end;
+ else {
+ v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+ + vma->vm_start;
+ if (v_end > vma->vm_end)
+ v_end = vma->vm_end;
+ }
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+ NULL);
+ }
+}
/*
* remove_inode_hugepages handles two distinct cases: truncation and hole
* punch. There are subtle differences in operation for each case.
-
+ *
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ bool rsv_on_error;
u32 hash;
/*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
mapping, next, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
- lock_page(page);
- if (likely(!page_mapped(page))) {
- bool rsv_on_error = !PagePrivate(page);
- /*
- * We must free the huge page and remove
- * from page cache (remove_huge_page) BEFORE
- * removing the region/reserve map
- * (hugetlb_unreserve_pages). In rare out
- * of memory conditions, removal of the
- * region/reserve map could fail. Before
- * free'ing the page, note PagePrivate which
- * is used in case of error.
- */
- remove_huge_page(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(
- inode, next,
- next + 1, 1)))
- hugetlb_fix_reserve_counts(
- inode, rsv_on_error);
- }
- } else {
- /*
- * If page is mapped, it was faulted in after
- * being unmapped. It indicates a race between
- * hole punch and page fault. Do nothing in
- * this case. Getting here in a truncate
- * operation is a bug.
- */
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ *
+ * This race can only happen in the hole punch case.
+ * Getting here in a truncate operation is a bug.
+ */
+ if (unlikely(page_mapped(page))) {
BUG_ON(truncate_op);
+
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ next * pages_per_huge_page(h),
+ (next + 1) * pages_per_huge_page(h));
+ i_mmap_unlock_write(mapping);
+ }
+
+ lock_page(page);
+ /*
+ * We must free the huge page and remove from page
+ * cache (remove_huge_page) BEFORE removing the
+ * region/reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the
+ * region/reserve map could fail. Before free'ing
+ * the page, note PagePrivate which is used in case
+ * of error.
+ */
+ rsv_on_error = !PagePrivate(page);
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode,
+ next, next + 1, 1)))
+ hugetlb_fix_reserve_counts(inode,
+ rsv_on_error);
}
unlock_page(page);
@@ -452,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
- struct vm_area_struct *vma;
-
- /*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
- */
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
- unsigned long v_offset;
-
- /*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
- */
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (end) {
- end = ((end - start) << PAGE_SHIFT) +
- vma->vm_start + v_offset;
- if (end > vma->vm_end)
- end = vma->vm_end;
- } else
- end = vma->vm_end;
-
- unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
- }
-}
-
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
@@ -521,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (hole_end > hole_start) {
struct address_space *mapping = inode->i_mapping;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -529,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
hole_end >> PAGE_SHIFT);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, hole_start, hole_end);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
@@ -563,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
start = offset >> hpage_shift;
end = (offset + len + hpage_size - 1) >> hpage_shift;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
error = inode_newsize_ok(inode, offset + len);
@@ -650,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
i_size_write(inode, offset + len);
inode->i_ctime = CURRENT_TIME;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
@@ -708,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
/*
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
* be taken from reclaim -- unlike regular filesystems. This needs an
- * annotation because huge_pmd_share() does an allocation under
+ * annotation because huge_pmd_share() does an allocation under hugetlb's
* i_mmap_rwsem.
*/
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
@@ -738,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* The policy is initialized here even if we are creating a
* private inode because initialization simply creates an
- * an empty rb tree and calls spin_lock_init(), later when we
+ * an empty rb tree and calls rwlock_init(), later when we
* call mpol_free_shared_policy() it will just return because
* the rb tree will still be empty.
*/
@@ -1202,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = {
.mount = hugetlbfs_mount,
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("hugetlbfs");
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1322,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void)
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
- 0, 0, init_once);
+ 0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
goto out2;
@@ -1356,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void)
out2:
return error;
}
-
-static void __exit exit_hugetlbfs_fs(void)
-{
- struct hstate *h;
- int i;
-
-
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(hugetlbfs_inode_cachep);
- i = 0;
- for_each_hstate(h)
- kern_unmount(hugetlbfs_vfsmount[i++]);
- unregister_filesystem(&hugetlbfs_fs_type);
-}
-
-module_init(init_hugetlbfs_fs)
-module_exit(exit_hugetlbfs_fs)
-
-MODULE_LICENSE("GPL");
+fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index 4230f66b7410..9f62db3bcc3e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -495,7 +495,7 @@ void clear_inode(struct inode *inode)
*/
spin_lock_irq(&inode->i_data.tree_lock);
BUG_ON(inode->i_data.nrpages);
- BUG_ON(inode->i_data.nrshadows);
+ BUG_ON(inode->i_data.nrexceptional);
spin_unlock_irq(&inode->i_data.tree_lock);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
@@ -966,9 +966,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
swap(inode1, inode2);
if (inode1 && !S_ISDIR(inode1->i_mode))
- mutex_lock(&inode1->i_mutex);
+ inode_lock(inode1);
if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+ inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
@@ -980,9 +980,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
if (inode1 && !S_ISDIR(inode1->i_mode))
- mutex_unlock(&inode1->i_mutex);
+ inode_unlock(inode1);
if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
- mutex_unlock(&inode2->i_mutex);
+ inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);
@@ -1883,7 +1883,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 29466c380958..116a333e9c77 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode,
u64 len, get_block_t *get_block)
{
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
EXPORT_SYMBOL(generic_block_fiemap);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 61abdc4920da..bcd2d41b318a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (isofs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a3750f902adc..0ae91ad6df2d 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
#include "nodelist.h"
static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
return 0;
out_free:
-#ifndef __ECOS
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
-#endif
- kfree(c->blocks);
+ kvfree(c->blocks);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f509f62e12f6..c5ac5944bc1b 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Trigger GC to flush any pending writes for this inode */
jffs2_flush_wbuf_gc(c, inode->i_ino);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2caf1682036d..bead25ae8fe4 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
- kfree(c->blocks);
+ kvfree(c->blocks);
out_inohash:
jffs2_clear_xattr_subsystem(c);
kfree(c->inocache_list);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e3176a1..0a9a114bb9d1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
- kfree(c->blocks);
+ kvfree(c->blocks);
jffs2_flash_cleanup(c);
kfree(c->inocache_list);
jffs2_clear_xattr_subsystem(c);
@@ -387,7 +384,7 @@ static int __init init_jffs2_fs(void)
jffs2_inode_cachep = kmem_cache_create("jffs2_i",
sizeof(struct jffs2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 0e026a7bdcd4..4ce7735dd042 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (rc)
return rc;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (!(inode->i_state & I_DIRTY_ALL) ||
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc;
}
rc |= jfs_commit_inode(inode, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc ? -EIO : 0;
}
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 8db8b7d61e40..8653cac7e12e 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
/* Lock against other parallel changes of flags */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
jfs_get_inode_flags(jfs_inode);
oldflags = jfs_inode->mode2;
@@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
((flags ^ oldflags) &
(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
err = -EPERM;
goto setflags_out;
}
@@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
jfs_inode->mode2 = flags;
jfs_set_inode_flags(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
setflags_out:
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8f9176caf098..4f5d85ba8e23 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
struct buffer_head tmp_bh;
struct buffer_head *bh;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
while (towrite > 0) {
tocopy = sb->s_blocksize - offset < towrite ?
sb->s_blocksize - offset : towrite;
@@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
}
out:
if (len == towrite) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
if (inode->i_size < off+len-towrite)
@@ -832,7 +832,7 @@ out:
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return len - towrite;
}
@@ -898,7 +898,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (jfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 742bf4a230e8..996b7742c90b 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- /*
- * If the ino of the sysfs entry created for a kmem cache gets
- * allocated from an ida layer, which is accounted to the memcg that
- * owns the cache, the memcg will get pinned forever. So do not account
- * ino ida allocations.
- */
- ret = ida_simple_get(&root->ino_ida, 1, 0,
- GFP_KERNEL | __GFP_NOACCOUNT);
+ ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
if (ret < 0)
goto err_out2;
kn->ino = ret;
@@ -1518,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
struct inode *inode = file_inode(file);
loff_t ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = generic_file_llseek(file, offset, whence);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 01491299f348..0ca80b2af420 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close);
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry *dentry = file->f_path.dentry;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
switch (whence) {
case 1:
offset += file->f_pos;
@@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&dentry->d_lock);
}
}
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);
@@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = sync_mapping_buffers(inode->i_mapping);
if (!(inode->i_state & I_DIRTY_ALL))
goto out;
@@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
ret = err;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5f31ebd96c06..154a107cd376 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,13 +25,17 @@
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/inetdevice.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <net/ip.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <linux/lockd/lockd.h>
#include <linux/nfs.h>
@@ -44,7 +48,7 @@
static struct svc_program nlmsvc_program;
-struct nlmsvc_binding * nlmsvc_ops;
+const struct nlmsvc_binding *nlmsvc_ops;
EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
@@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void)
static void grace_ender(struct work_struct *grace)
{
- struct delayed_work *dwork = container_of(grace, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(grace);
struct lockd_net *ln = container_of(dwork, struct lockd_net,
grace_period_end);
@@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
}
}
+static int lockd_inetaddr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inetaddr_event: removed %pI4\n",
+ &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inetaddr_notifier = {
+ .notifier_call = lockd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int lockd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inet6addr_notifier = {
+ .notifier_call = lockd_inet6addr_event,
+};
+#endif
+
+static void lockd_svc_exit_thread(void)
+{
+ unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
+ svc_exit_thread(nlmsvc_rqst);
+}
+
static int lockd_start_svc(struct svc_serv *serv)
{
int error;
@@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv)
return 0;
out_task:
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
nlmsvc_task = NULL;
out_rqst:
nlmsvc_rqst = NULL;
@@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void)
printk(KERN_WARNING "lockd_up: create service failed\n");
return ERR_PTR(-ENOMEM);
}
+ register_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
dprintk("lockd_up: service created\n");
return serv;
}
@@ -428,7 +497,7 @@ lockd_down(struct net *net)
}
kthread_stop(nlmsvc_task);
dprintk("lockd_down: service stopped\n");
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
dprintk("lockd_down: service destroyed\n");
nlmsvc_task = NULL;
nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index af1ed74a657f..7c5f91be9b65 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
* bother, maybe that's a sign this just isn't a good file to
* hand out a delegation on.
*/
- if (is_deleg && !mutex_trylock(&inode->i_mutex))
+ if (is_deleg && !inode_trylock(inode))
return -EAGAIN;
if (is_deleg && arg == F_WRLCK) {
/* Write delegations are not currently supported: */
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -1732,7 +1732,7 @@ out:
spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
if (is_deleg)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!error && !my_fl)
*flp = NULL;
return error;
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066c0221..2b4503163930 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
config LOGFS
tristate "LogFS file system"
- depends on (MTD || BLOCK)
+ depends on MTD || (!MTD && BLOCK)
select ZLIB_INFLATE
select ZLIB_DEFLATE
select CRC32
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 1a6f0167b16a..61eaeb1b6cac 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = li->li_flags;
flags &= LOGFS_FL_USER_MODIFIABLE;
flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
li->li_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
inode->i_ctime = CURRENT_TIME;
mark_inode_dirty_sync(inode);
@@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
logfs_put_wblocks(sb, NULL, WF_LOCK);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 0fce46d62b9c..db9cfc598883 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -409,7 +409,8 @@ const struct super_operations logfs_super_operations = {
int logfs_init_inode_cache(void)
{
logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
- sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+ sizeof(struct logfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
logfs_init_once);
if (!logfs_inode_cache)
return -ENOMEM;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 39d91f86cd35..27d040e35faa 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
#endif
/* dev_mtd.c */
-#ifdef CONFIG_MTD
+#if IS_ENABLED(CONFIG_MTD)
int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
#else
static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index cb1789ca1ee6..f975d667c539 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/namei.c b/fs/namei.c
index bceefd5588a2..f624d132e01e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1629,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
parent = nd->path.dentry;
BUG_ON(nd->inode != parent->d_inode);
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
dentry = __lookup_hash(&nd->last, parent, nd->flags);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
path->mnt = nd->path.mnt;
@@ -2229,10 +2229,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
putname(filename);
return ERR_PTR(-EINVAL);
}
- mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
d = __lookup_hash(&last, path->dentry, 0);
if (IS_ERR(d)) {
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
path_put(path);
}
putname(filename);
@@ -2282,7 +2282,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
unsigned int c;
int err;
- WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(base->d_inode));
this.name = name;
this.len = len;
@@ -2380,9 +2380,9 @@ struct dentry *lookup_one_len_unlocked(const char *name,
if (ret)
return ret;
- mutex_lock(&base->d_inode->i_mutex);
+ inode_lock(base->d_inode);
ret = __lookup_hash(&this, base, 0);
- mutex_unlock(&base->d_inode->i_mutex);
+ inode_unlock(base->d_inode);
return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2463,7 +2463,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
goto done;
}
- mutex_lock(&dir->d_inode->i_mutex);
+ inode_lock(dir->d_inode);
dentry = d_lookup(dir, &nd->last);
if (!dentry) {
/*
@@ -2473,16 +2473,16 @@ mountpoint_last(struct nameidata *nd, struct path *path)
*/
dentry = d_alloc(dir, &nd->last);
if (!dentry) {
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
return -ENOMEM;
}
dentry = lookup_real(dir->d_inode, dentry, nd->flags);
if (IS_ERR(dentry)) {
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
return PTR_ERR(dentry);
}
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
done:
if (d_is_negative(dentry)) {
@@ -2672,7 +2672,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
struct dentry *p;
if (p1 == p2) {
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
return NULL;
}
@@ -2680,29 +2680,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
p = d_ancestor(p2, p1);
if (p) {
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
return p;
}
p = d_ancestor(p1, p2);
if (p) {
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
return p;
}
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
return NULL;
}
EXPORT_SYMBOL(lock_rename);
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
- mutex_unlock(&p1->d_inode->i_mutex);
+ inode_unlock(p1->d_inode);
if (p1 != p2) {
- mutex_unlock(&p2->d_inode->i_mutex);
+ inode_unlock(p2->d_inode);
mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
}
}
@@ -3141,9 +3141,9 @@ retry_lookup:
* dropping this one anyway.
*/
}
- mutex_lock(&dir->d_inode->i_mutex);
+ inode_lock(dir->d_inode);
error = lookup_open(nd, &path, file, op, got_write, opened);
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
if (error <= 0) {
if (error)
@@ -3489,7 +3489,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* Do the final lookup.
*/
lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
- mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path->dentry, lookup_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -3518,7 +3518,7 @@ fail:
dput(dentry);
dentry = ERR_PTR(error);
unlock:
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
if (!err2)
mnt_drop_write(path->mnt);
out:
@@ -3538,7 +3538,7 @@ EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
{
dput(dentry);
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
}
@@ -3735,7 +3735,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EPERM;
dget(dentry);
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
error = -EBUSY;
if (is_local_mountpoint(dentry))
@@ -3755,7 +3755,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
detach_mounts(dentry);
out:
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
d_delete(dentry);
@@ -3794,7 +3794,7 @@ retry:
if (error)
goto exit1;
- mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -3810,7 +3810,7 @@ retry:
exit3:
dput(dentry);
exit2:
- mutex_unlock(&path.dentry->d_inode->i_mutex);
+ inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
exit1:
path_put(&path);
@@ -3856,7 +3856,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
if (!dir->i_op->unlink)
return -EPERM;
- mutex_lock(&target->i_mutex);
+ inode_lock(target);
if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
@@ -3873,7 +3873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
}
}
out:
- mutex_unlock(&target->i_mutex);
+ inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
@@ -3916,7 +3916,7 @@ retry:
if (error)
goto exit1;
retry_deleg:
- mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
@@ -3934,7 +3934,7 @@ retry_deleg:
exit2:
dput(dentry);
}
- mutex_unlock(&path.dentry->d_inode->i_mutex);
+ inode_unlock(path.dentry->d_inode);
if (inode)
iput(inode); /* truncate the inode here */
inode = NULL;
@@ -4086,7 +4086,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Make sure we don't allow creating hardlink to an unlinked file */
if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
error = -ENOENT;
@@ -4103,7 +4103,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
inode->i_state &= ~I_LINKABLE;
spin_unlock(&inode->i_lock);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!error)
fsnotify_link(dir, inode, new_dentry);
return error;
@@ -4303,7 +4303,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!is_dir || (flags & RENAME_EXCHANGE))
lock_two_nondirectories(source, target);
else if (target)
- mutex_lock(&target->i_mutex);
+ inode_lock(target);
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
@@ -4356,7 +4356,7 @@ out:
if (!is_dir || (flags & RENAME_EXCHANGE))
unlock_two_nondirectories(source, target);
else if (target)
- mutex_unlock(&target->i_mutex);
+ inode_unlock(target);
dput(new_dentry);
if (!error) {
fsnotify_move(old_dir, new_dir, old_name, is_dir,
diff --git a/fs/namespace.c b/fs/namespace.c
index a830e1463704..4fb1691b4355 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path)
struct vfsmount *mnt;
struct dentry *dentry = path->dentry;
retry:
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
if (unlikely(cant_mount(dentry))) {
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
return ERR_PTR(-ENOENT);
}
namespace_lock();
@@ -1974,13 +1974,13 @@ retry:
mp = new_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
return mp;
}
return mp;
}
namespace_unlock();
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
path_put(path);
path->mnt = mnt;
dentry = path->dentry = dget(mnt->mnt_root);
@@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where)
struct dentry *dentry = where->m_dentry;
put_mountpoint(where);
namespace_unlock();
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f0e3e9e747dd..26c2de2de13f 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
if (!res) {
struct inode *inode = d_inode(dentry);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
ncp_new_dentry(dentry);
val=1;
@@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
ncp_dbg(2, "found, but dirEntNum changed\n");
ncp_update_inode2(inode, &finfo);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
finished:
@@ -639,9 +639,9 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
} else {
struct inode *inode = d_inode(newdent);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(inode, I_MUTEX_CHILD);
ncp_update_inode2(inode, entry);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (ctl.idx >= NCP_DIRCACHE_SIZE) {
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 011324ce9df2..dd38ca1f2ecb 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iocb->ki_pos = pos;
if (pos > i_size_read(inode)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (pos > i_size_read(inode))
i_size_write(inode, pos);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
ncp_dbg(1, "exit %pD2\n", file);
outrel:
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index ce1eb3f9dfe8..1af15fcbe57b 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
sizeof(struct ncp_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ncp_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 807eb6ef4f91..f0939d097406 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
res = htonl(NFS4ERR_BADHANDLE);
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+ &args->stateid, -ntohl(res));
goto out;
+ }
/* Set up a helper thread to actually return the delegation */
switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
case 0:
@@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
default:
res = htonl(NFS4ERR_RESOURCE);
}
- trace_nfs4_recall_delegation(inode, -ntohl(res));
+ trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+ &args->stateid, -ntohl(res));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -160,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
return lo;
}
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new)
+{
+ u32 oldseq, newseq;
+
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ newseq = be32_to_cpu(new->seqid);
+
+ if (newseq > oldseq + 1)
+ return false;
+ return true;
+}
+
static u32 initiate_file_draining(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
@@ -169,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp,
LIST_HEAD(free_me_list);
lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
- if (!lo)
+ if (!lo) {
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
+ &args->cbl_stateid, -rv);
goto out;
+ }
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
+ if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
+ rv = NFS4ERR_DELAY;
+ goto unlock;
+ }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
spin_unlock(&ino->i_lock);
pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range)) {
+ /*
+ * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+ */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
rv = NFS4ERR_DELAY;
goto unlock;
}
+ if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+ &args->cbl_range)) {
+ rv = NFS4_OK;
+ goto unlock;
+ }
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
+ pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(ino, 0);
pnfs_put_layout_hdr(lo);
- trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+ &args->cbl_stateid, -rv);
iput(ino);
out:
return rv;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce5a21861074..9cce67043f92 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
filp, offset, whence);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case 1:
offset += filp->f_pos;
@@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
dir_ctx->duped = 0;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
@@ -1894,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_USER);
if (!page)
return -ENOMEM;
- kaddr = kmap_atomic(page);
+ kaddr = page_address(page);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
- kunmap_atomic(kaddr);
trace_nfs_symlink_enter(dir, dentry);
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
@@ -2432,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
}
EXPORT_SYMBOL_GPL(nfs_may_open);
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ if (mask & MAY_NOT_BLOCK)
+ ret = nfs_revalidate_inode_rcu(server, inode);
+ else
+ ret = nfs_revalidate_inode(server, inode);
+ if (ret == 0 && !execute_ok(inode))
+ ret = -EACCES;
+ return ret;
+}
+
int nfs_permission(struct inode *inode, int mask)
{
struct rpc_cred *cred;
@@ -2449,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask)
case S_IFLNK:
goto out;
case S_IFREG:
+ if ((mask & MAY_OPEN) &&
+ nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+ return 0;
break;
case S_IFDIR:
/*
@@ -2481,8 +2497,8 @@ force_lookup:
res = PTR_ERR(cred);
}
out:
- if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
- res = -EACCES;
+ if (!res && (mask & MAY_EXEC))
+ res = nfs_execute_ok(inode, mask);
dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4b1d08f56aba..7a0cfd3266e5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
-void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
-{
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-}
-EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
-
static void
nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
{
@@ -586,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
if (!count)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
result = nfs_sync_mapping(mapping);
if (result)
goto out_unlock;
@@ -614,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
NFS_I(inode)->read_io += count;
result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -628,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
out_release:
nfs_direct_req_release(dreq);
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
return result;
}
@@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
req = nfs_list_entry(reqs.next);
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ list_splice_init(&reqs, &failed);
+ goto out_failed;
+ }
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
@@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_list_add_request(req, &failed);
spin_lock(cinfo.lock);
dreq->flags = 0;
- dreq->error = -EIO;
+ if (desc.pg_error < 0)
+ dreq->error = desc.pg_error;
+ else
+ dreq->error = -EIO;
spin_unlock(cinfo.lock);
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
+out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_direct_write_complete(dreq, data->inode);
}
-static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
- /* There is no lock to clear */
+ struct nfs_direct_req *dreq = cinfo->dreq;
+
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_mark_request_commit(req, NULL, cinfo, 0);
}
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
.completion = nfs_direct_commit_complete,
- .error_cleanup = nfs_direct_error_cleanup,
+ .resched_write = nfs_direct_resched_write,
};
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
}
}
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+
+ spin_lock(&dreq->lock);
+ if (dreq->error == 0) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ /* fake unstable write to let common nfs resend pages */
+ hdr->verf.committed = NFS_UNSTABLE;
+ hdr->good_bytes = hdr->args.count;
+ }
+ spin_unlock(&dreq->lock);
+}
+
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.error_cleanup = nfs_write_sync_pgio_error,
.init_hdr = nfs_direct_pgio_init,
.completion = nfs_direct_write_completion,
+ .reschedule_io = nfs_direct_write_reschedule_io,
};
@@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
nfs_lock_request(req);
req->wb_index = pos >> PAGE_SHIFT;
@@ -977,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos = iocb->ki_pos;
end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
result = nfs_sync_mapping(mapping);
if (result)
@@ -1017,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos >> PAGE_CACHE_SHIFT, end);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -1038,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
out_release:
nfs_direct_req_release(dreq);
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 93e236429c5d..748bb813b8ec 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by
@@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page,
* so it will not block due to pages that will shortly be freeable.
*/
nfsi = NFS_I(mapping->host);
- if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+ if (atomic_read(&nfsi->commit_info.rpcs_out)) {
*writeback = true;
return;
}
@@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_page(inode, page);
+ return nfs_wb_launder_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -756,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
if (!IS_ERR(l_ctx)) {
- status = nfs_iocounter_wait(&l_ctx->io_count);
+ status = nfs_iocounter_wait(l_ctx);
nfs_put_lock_context(l_ctx);
if (status < 0)
return status;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 02ec07973bc4..3384dc8e6683 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
pnfs_error_mark_layout_for_return(inode, lseg);
+ pnfs_set_lo_fail(lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -883,13 +884,19 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_read_mds(pgio);
@@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -957,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
u32 i, j;
if (fl->commit_through_mds) {
- nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+ nfs_request_add_commit_list(req, cinfo);
} else {
/* Note that we are calling nfs4_fl_calc_j_index on each page
* that ends up being committed to a data server. An attractive
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 03516c80855a..5bcd92d50e82 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return false;
for (i = 0; i < m1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; i++) {
+ for (j = 0; j < m2->fh_versions_cnt; j++) {
if (nfs_compare_fh(&m1->fh_versions[i],
&m2->fh_versions[j]) == 0) {
found_fh = true;
@@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
}
p = xdr_inline_decode(&stream, 4);
- if (p)
- fls->flags = be32_to_cpup(p);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ for (i=0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+out_sort_mirrors:
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
@@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
- if (layoutstats_timer != 0)
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
report_interval) {
@@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int i;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -867,18 +889,25 @@ static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
if (pgio->pg_lseg)
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
nfs_pageio_reset_write_mds(pgio);
+out:
return 1;
}
@@ -912,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
hdr->args.count,
(unsigned long long)hdr->args.offset);
- if (!hdr->dreq) {
- struct nfs_open_context *ctx;
-
- ctx = nfs_list_entry(hdr->pages.next)->wb_context;
- set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
- hdr->completion_ops->error_cleanup(&hdr->pages);
- } else {
- nfs_direct_set_resched_writes(hdr->dreq);
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.count;
- }
+ hdr->completion_ops->reschedule_io(hdr);
return;
}
@@ -1101,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
task->tk_status = 0;
- rpc_restart_call(task);
+ rpc_restart_call_prepare(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
return -EAGAIN;
}
@@ -1159,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ return;
+ default:
+ break;
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
@@ -1242,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
return ff_layout_test_devid_unavailable(node);
}
-static int ff_layout_read_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ hdr->res.count);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1265,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
}
hdr->pgio_done_cb = ff_layout_read_done_cb;
+ ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1323,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1341,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_read_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}
+static void ff_layout_read_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1362,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, true);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -1402,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1421,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return 0;
}
-static int ff_layout_write_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+}
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1445,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EAGAIN;
}
+ ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1480,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1499,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_write_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(cdata->inode,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
0, task->tk_start);
}
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
ff_layout_commit_prepare_common(task, data);
@@ -1531,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
- struct nfs_commit_data *cdata = data;
- struct nfs_page *req;
- __u64 count = 0;
-
- if (task->tk_status == 0) {
- list_for_each_entry(req, &cdata->pages, wb_list)
- count += req->wb_bytes;
- }
-
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- count, count, NFS_FILE_SYNC);
-
pnfs_generic_write_commit_done(task, data);
}
@@ -1551,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
+ ff_layout_commit_record_layoutstats_done(task, cdata);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static enum pnfs_try_status
@@ -1859,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
start = xdr_reserve_space(xdr, 4);
BUG_ON(!start);
- if (ff_layout_encode_ioerr(flo, xdr, args))
- goto out;
-
+ ff_layout_encode_ioerr(flo, xdr, args);
ff_layout_encode_iostats(flo, xdr, args);
-out:
+
*start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 2bb08bc6aaf0..dd353bb7dc0a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror {
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
ktime_t last_report_time;
+ u32 report_interval;
};
struct nfs4_ff_layout_segment {
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e125e55de86d..29898a9550fa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
err->length = end - err->offset;
}
-static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- nfs4_stateid *stateid,
- struct nfs4_deviceid *deviceid)
+static int
+ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
+ const struct nfs4_ff_layout_ds_err *e2)
{
- return err->status == status && err->opnum == opnum &&
- nfs4_stateid_match(&err->stateid, stateid) &&
- !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
- end_offset(err->offset, err->length) >= offset &&
- err->offset <= end_offset(offset, length);
-}
-
-static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
- struct nfs4_ff_layout_ds_err *new)
-{
- if (!ds_error_can_merge(old, new->offset, new->length, new->status,
- new->opnum, &new->stateid, &new->deviceid))
- return false;
-
- extend_ds_error(old, new->offset, new->length);
- return true;
+ int ret;
+
+ if (e1->opnum != e2->opnum)
+ return e1->opnum < e2->opnum ? -1 : 1;
+ if (e1->status != e2->status)
+ return e1->status < e2->status ? -1 : 1;
+ ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+ if (ret != 0)
+ return ret;
+ ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
+ if (ret != 0)
+ return ret;
+ if (end_offset(e1->offset, e1->length) < e2->offset)
+ return -1;
+ if (e1->offset > end_offset(e2->offset, e2->length))
+ return 1;
+ /* If ranges overlap or are contiguous, they are the same */
+ return 0;
}
-static bool
+static void
ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_ds_err *dserr)
{
- struct nfs4_ff_layout_ds_err *err;
-
- list_for_each_entry(err, &flo->error_list, list) {
- if (merge_ds_error(err, dserr)) {
- return true;
- }
- }
-
- list_add(&dserr->list, &flo->error_list);
- return false;
-}
-
-static bool
-ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
-{
- bool found = false;
- struct nfs4_ff_layout_ds_err *err;
-
- list_for_each_entry(err, &flo->error_list, list) {
- if (ds_error_can_merge(err, offset, length, status, opnum,
- stateid, deviceid)) {
- found = true;
- extend_ds_error(err, offset, length);
+ struct nfs4_ff_layout_ds_err *err, *tmp;
+ struct list_head *head = &flo->error_list;
+ int match;
+
+ /* Do insertion sort w/ merges */
+ list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
+ match = ff_ds_error_match(err, dserr);
+ if (match < 0)
+ continue;
+ if (match > 0) {
+ /* Add entry "dserr" _before_ entry "err" */
+ head = &err->list;
break;
}
+ /* Entries match, so merge "err" into "dserr" */
+ extend_ds_error(dserr, err->offset, err->length);
+ list_del(&err->list);
+ kfree(err);
}
- return found;
+ list_add_tail(&dserr->list, head);
}
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
@@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
gfp_t gfp_flags)
{
struct nfs4_ff_layout_ds_err *dserr;
- bool needfree;
if (status == 0)
return 0;
@@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
if (mirror->mirror_ds == NULL)
return -EINVAL;
- spin_lock(&flo->generic_hdr.plh_inode->i_lock);
- if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
- &mirror->stateid,
- &mirror->mirror_ds->id_node.deviceid)) {
- spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
- return 0;
- }
- spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
dserr = kmalloc(sizeof(*dserr), gfp_flags);
if (!dserr)
return -ENOMEM;
@@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
NFS4_DEVICEID4_SIZE);
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
- needfree = ff_layout_add_ds_error_locked(flo, dserr);
+ ff_layout_add_ds_error_locked(flo, dserr);
spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
- if (needfree)
- kfree(dserr);
return 0;
}
@@ -429,22 +410,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (fail_return) {
- pnfs_error_mark_layout_for_return(ino, lseg);
- if (ff_layout_has_available_ds(lseg))
- pnfs_set_retry_layoutget(lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(lseg->pls_layout);
-
- } else {
+ if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lseg->pls_layout->plh_flags);
- else {
+ else
pnfs_error_mark_layout_for_return(ino, lseg);
- pnfs_clear_retry_layoutget(lseg->pls_layout);
- }
- }
+ } else
+ pnfs_error_mark_layout_for_return(ino, lseg);
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bdb4dc7b4ecd..86faecf8f328 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
return nfs_fileid_to_ino_t(fattr->fileid);
}
-/**
- * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
- * @word: long word containing the bit lock
- */
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+static int nfs_wait_killable(int mode)
{
freezable_schedule_unsafe();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
}
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+ return nfs_wait_killable(mode);
+}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+int nfs_wait_atomic_killable(atomic_t *p)
+{
+ return nfs_wait_killable(TASK_KILLABLE);
+}
+
/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
@@ -655,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = nfs_sync_inode(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (err)
goto out;
}
@@ -700,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
l_ctx->lockowner.l_owner = current->files;
l_ctx->lockowner.l_pid = current->tgid;
INIT_LIST_HEAD(&l_ctx->list);
- nfs_iocounter_init(&l_ctx->io_count);
+ atomic_set(&l_ctx->io_count, 0);
}
static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
@@ -913,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ /*
+ * A fatal error occurred on an earlier write. Try to write back
+ * every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -1166,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode,
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
if (may_lock) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = nfs_invalidate_mapping(inode, mapping);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
} else
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
@@ -1663,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1724,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
inode->i_version = fattr->change_attr;
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity;
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- } else if (server->caps & NFS_CAP_MTIME)
+ } else if (server->caps & NFS_CAP_MTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- } else if (server->caps & NFS_CAP_CTIME)
+ } else if (server->caps & NFS_CAP_CTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1759,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)cur_isize,
(long long)new_isize);
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- else if (server->caps & NFS_CAP_ATIME)
+ else if (server->caps & NFS_CAP_ATIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -1780,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_mode = newmode;
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
- } else if (server->caps & NFS_CAP_MODE)
+ } else if (server->caps & NFS_CAP_MODE) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- } else if (server->caps & NFS_CAP_OWNER)
+ } else if (server->caps & NFS_CAP_OWNER) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ } else if (server->caps & NFS_CAP_OWNER_GROUP) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1818,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
}
- } else if (server->caps & NFS_CAP_NLINK)
+ } else if (server->caps & NFS_CAP_NLINK) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- }
- if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
+ else
+ cache_revalidated = false;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1840,9 +1872,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
- if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
- if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
- nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ if (cache_revalidated) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+ nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ nfsi->attrtimeo <<= 1;
+ if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ }
nfsi->attrtimeo_timestamp = now;
}
/* Set the barrier to be more recent than this fattr */
@@ -1851,7 +1887,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
/* Don't declare attrcache up to date if there were no attrs! */
- if (fattr->valid != 0)
+ if (cache_revalidated)
invalid &= ~NFS_INO_INVALID_ATTR;
/* Don't invalidate the data if we were to blame */
@@ -1933,7 +1969,7 @@ static int __init nfs_init_inodecache(void)
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
sizeof(struct nfs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (nfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9dea85f7f918..9a547aa3ec8e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
-int nfs_iocounter_wait(struct nfs_io_counter *c);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
@@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline void nfs_iocounter_init(struct nfs_io_counter *c)
-{
- c->flags = 0;
- atomic_set(&c->io_count, 0);
-}
-
static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
{
WARN_ON_ONCE(desc->pg_mirror_count < 1);
return desc->pg_mirror_count > 1;
}
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+ const struct nfs_open_context *ctx2)
+{
+ return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -380,6 +380,7 @@ extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p);
/* super.c */
extern const struct super_operations nfs_sops;
@@ -483,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list,
struct nfs_commit_info *cinfo,
u32 ds_commit_idx);
void nfs_commitdata_release(struct nfs_commit_data *data);
-void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+void nfs_request_add_commit_list(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void nfs_request_add_commit_list_locked(struct nfs_page *req,
struct list_head *dst,
@@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
inode_dio_wait(inode);
}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
-extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+ NFS4_STATEID_OTHER_SIZE);
+}
#else
static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return 0;
}
+static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+{
+ return 0;
+}
#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -E2BIG:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6b1ce9825430..bd25dc7077f7 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -101,13 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -123,7 +123,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return -EOPNOTSUPP;
nfs_wb_all(inode);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0)
@@ -131,7 +131,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -204,6 +204,8 @@ static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo;
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -211,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match(&data->args.stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
case -ENOTSUPP:
case -EOPNOTSUPP:
- NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
default:
- dprintk("%s server returns %d\n", __func__, task->tk_status);
+ break;
}
+
+ dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 26f9a23e2b25..57ca1c8039c1 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
if (!ret)
ret = pnfs_sync_inode(inode, !!datasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by
@@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
if (same_inode) {
- mutex_lock(&src_inode->i_mutex);
+ inode_lock(src_inode);
} else if (dst_inode < src_inode) {
- mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(dst_inode, I_MUTEX_PARENT);
+ inode_lock_nested(src_inode, I_MUTEX_CHILD);
} else {
- mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(src_inode, I_MUTEX_PARENT);
+ inode_lock_nested(dst_inode, I_MUTEX_CHILD);
}
/* flush all pending writes on both src and dst so that server
@@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
out_unlock:
if (same_inode) {
- mutex_unlock(&src_inode->i_mutex);
+ inode_unlock(src_inode);
} else if (dst_inode < src_inode) {
- mutex_unlock(&src_inode->i_mutex);
- mutex_unlock(&dst_inode->i_mutex);
+ inode_unlock(src_inode);
+ inode_unlock(dst_inode);
} else {
- mutex_unlock(&dst_inode->i_mutex);
- mutex_unlock(&src_inode->i_mutex);
+ inode_unlock(dst_inode);
+ inode_unlock(src_inode);
}
out:
return ret;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c57d1332c1c8..4bfc33ad0563 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -208,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY,
FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ | FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_open_noattr_bitmap[3] = {
@@ -1385,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
+ spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
if (deleg_stateid != NULL) {
nfs4_stateid_copy(&state->stateid, deleg_stateid);
@@ -1393,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
if (open_stateid != NULL)
nfs_set_open_stateid_locked(state, open_stateid, fmode);
write_sequnlock(&state->seqlock);
- spin_lock(&state->owner->so_lock);
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
@@ -1598,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
if (!data->rpc_done) {
state = nfs4_try_open_cached(data);
+ trace_nfs4_cached_open(data->state);
goto out;
}
@@ -2015,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
}
return;
unlock_no_action:
+ trace_nfs4_cached_open(data->state);
rcu_read_unlock();
out_no_action:
task->tk_action = NULL;
@@ -2703,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
+ trace_nfs4_setattr(inode, &arg.stateid, status);
return status;
}
@@ -2719,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
int err;
do {
err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
- trace_nfs4_setattr(inode, err);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -5048,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5076,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
return -ENOMEM;
rcu_read_lock();
- result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+ scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
@@ -5089,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
static int
nfs4_init_uniquifier_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5109,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
@@ -5120,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5145,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
clp->cl_owner_id = str;
@@ -5384,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
if (data == NULL)
return -ENOMEM;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
@@ -5426,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
int err;
do {
err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
- trace_nfs4_delegreturn(inode, err);
+ trace_nfs4_delegreturn(inode, stateid, err);
switch (err) {
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -5936,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->cancelled = 1;
rpc_put_task(task);
dprintk("%s: done, ret = %d!\n", __func__, ret);
+ trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
return ret;
}
@@ -5952,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
- trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -5979,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
- trace_nfs4_lock_expired(request, state, F_SETLK, err);
switch (err) {
default:
goto out;
@@ -6087,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
do {
err = _nfs4_proc_setlk(state, cmd, request);
- trace_nfs4_set_lock(request, state, cmd, err);
if (err == -NFS4ERR_DENIED)
err = -EAGAIN;
err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -6847,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
},
.allow.u.words = {
[0] = 1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN) |
1 << (OP_COMMIT),
[1] = 1 << (OP_SECINFO - 32) |
1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_LAYOUTRETURN - 32) |
1 << (OP_TEST_STATEID - 32) |
1 << (OP_FREE_STATEID - 32) |
1 << (OP_WRITE - 32)
@@ -6915,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
}
if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+ test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
test_bit(OP_LOCKU, sp->allow.u.longs)) {
dfprintk(MOUNT, " cleanup mode enabled\n");
set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
}
+ if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &clp->cl_sp4_flags);
+ }
+
if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
dfprintk(MOUNT, " secinfo mode enabled\n");
@@ -7748,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
struct nfs4_session *session = nfs4_get_session(server);
+ int ret;
dprintk("--> %s\n", __func__);
/* Note the is a race here, where a CB_LAYOUTRECALL can come in
@@ -7758,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
if (nfs41_setup_sequence(session, &lgp->args.seq_args,
&lgp->res.seq_res, task))
return;
- if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
NFS_I(lgp->args.inode)->layout,
&lgp->args.range,
- lgp->args.ctx->state)) {
- rpc_exit(task, NFS4_OK);
- }
+ lgp->args.ctx->state);
+ if (ret < 0)
+ rpc_exit(task, ret);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -7783,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
goto out;
+
+ /*
+ * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+ * on the file. Set tk_status to -ENODATA to tell the upper layer to
+ * retry in-band.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ task->tk_status = -ENODATA;
+ goto out;
/*
* NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
* length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
@@ -7979,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
trace_nfs4_layoutget(lgp->args.ctx,
&lgp->args.range,
&lgp->res.range,
+ &lgp->res.stateid,
status);
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
@@ -8035,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+ pnfs_mark_layout_returned_if_empty(lo);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
pnfs_clear_layoutreturn_waitbit(lo);
- lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
@@ -8071,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
};
int status = 0;
+ nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+ NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
dprintk("--> %s\n", __func__);
if (!sync) {
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
@@ -8086,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutreturn(lrp->args.inode, status);
+ trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8234,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutcommit(data->args.inode, status);
+ trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0fbd3ab1be22..8693d77c45ea 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -12,7 +12,7 @@
#include "nfs4idmap.h"
#include "callback.h"
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d774335cc8bc..2850bce19244 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -6,6 +6,7 @@
#include "internal.h"
#include "nfs4session.h"
#include "callback.h"
+#include "pnfs.h"
#define CREATE_TRACE_POINTS
#include "nfs4trace.h"
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 671cf68fe56b..2c8d05dae5b1 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done,
__entry->highest_slotid = res->sr_highest_slotid;
__entry->target_highest_slotid =
res->sr_target_highest_slotid;
+ __entry->status_flags = res->sr_status_flags;
__entry->error = res->sr_status;
),
TP_printk(
@@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__field(u64, fileid)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, openstateid_seq)
+ __field(u32, openstateid_hash)
),
TP_fast_assign(
@@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->flags = flags;
__entry->fmode = (__force unsigned int)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
- if (!IS_ERR_OR_NULL(state))
+ if (!IS_ERR_OR_NULL(state)) {
inode = state->inode;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->openstateid_seq =
+ be32_to_cpu(state->open_stateid.seqid);
+ __entry->openstateid_hash =
+ nfs_stateid_hash(&state->open_stateid);
+ } else {
+ __entry->stateid_seq = 0;
+ __entry->stateid_hash = 0;
+ __entry->openstateid_seq = 0;
+ __entry->openstateid_hash = 0;
+ }
if (inode != NULL) {
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
TP_printk(
"error=%d (%s) flags=%d (%s) fmode=%s "
"fileid=%02x:%02x:%llu fhandle=0x%08x "
- "name=%02x:%02x:%llu/%s",
+ "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+ "openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->flags,
@@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fhandle,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->openstateid_seq, __entry->openstateid_hash
)
);
@@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+TRACE_EVENT(nfs4_cached_open, /* records file identity, state->state and stateid of an nfs4_state */
+ TP_PROTO(
+ const struct nfs4_state *state
+ ),
+ TP_ARGS(state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode; /* dereferenced below without a NULL check */
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); /* only a hash of the file handle is stored */
+ __entry->fmode = (__force unsigned int)state->state; /* mode bits come from state->state */
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid); /* seqid converted from big-endian */
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid); /* stateid is logged as a hash */
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x stateid=%d:0x%08x",
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed", /* printed when the recorded fmode is 0 */
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
TRACE_EVENT(nfs4_close,
TP_PROTO(
const struct nfs4_state *state,
@@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close,
__field(u64, fileid)
__field(unsigned int, fmode)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close,
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->fmode = (__force unsigned int)state->state;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->stateid);
),
TP_printk(
"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
- "fhandle=0x%08x",
+ "fhandle=0x%08x openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->fmode ? show_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) cmd=%s:%s range=%lld:%lld "
- "fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
show_lock_cmd(__entry->cmd),
@@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
), \
TP_ARGS(request, state, cmd, error))
DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+TRACE_EVENT(nfs4_set_lock, /* lock tracepoint that logs the state's stateid and a separately passed lock stateid */
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ const nfs4_stateid *lockstateid,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, lockstateid, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, lockstateid_seq)
+ __field(u32, lockstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type; /* lock type copied from the request */
+ __entry->start = request->fl_start; /* byte range copied from the request */
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); /* only a hash of the file handle is stored */
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid); /* seqid converted from big-endian */
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->lockstateid_seq =
+ be32_to_cpu(lockstateid->seqid); /* NOTE(review): lockstateid is dereferenced unconditionally */
+ __entry->lockstateid_hash =
+ nfs_stateid_hash(lockstateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash, /* each stateid is printed as seq:hash */
+ __entry->lockstateid_seq, __entry->lockstateid_hash
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_PROTO(
const struct inode *inode,
@@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit,
__field(dev_t, dev)
__field(u32, fhandle)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
__entry->dev = res->server->s_dev;
__entry->fhandle = nfs_fhandle_hash(args->fhandle);
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(args->stateid);
),
TP_printk(
- "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+ "error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
- "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
), \
TP_ARGS(inode, error))
-DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_EVENT(nfs4_access);
DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
@@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
-DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, /* event class: inode identity, a stateid and an error code */
+ TP_PROTO(
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev; /* inode is dereferenced without a NULL check */
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); /* only a hash of the file handle is stored */
+ __entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid); /* NOTE(review): stateid is dereferenced unconditionally */
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x", /* stateid is printed as seq:hash */
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) /* define event 'name' in class nfs4_inode_stateid_event */ \
+ DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+ TP_PROTO( /* prototype repeats the class's (inode, stateid, error) */ \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
@@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
), \
TP_ARGS(clp, fhandle, inode, error))
DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
-DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, /* event class: callback with client address, file handle, optional inode and a stateid */
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ? /* must use the same expression as __assign_str below */
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle); /* hashed from the fhandle argument, not from the inode */
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else { /* no inode supplied: record zeros */
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ? /* "unknown" is recorded when clp is NULL */
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown"); /* explicit ';' so this does not depend on the macro supplying one */
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid); /* NOTE(review): stateid is dereferenced unconditionally */
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s", /* stateid is printed as seq:hash */
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) /* define event 'name' in class nfs4_inode_stateid_callback_event */ \
+ DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+ TP_PROTO( /* prototype repeats the class's five arguments */ \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
@@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
#define DEFINE_NFS4_READ_EVENT(name) \
@@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget,
const struct nfs_open_context *ctx,
const struct pnfs_layout_range *args,
const struct pnfs_layout_range *res,
+ const nfs4_stateid *layout_stateid,
int error
),
- TP_ARGS(ctx, args, res, error),
+ TP_ARGS(ctx, args, res, layout_stateid, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget,
__field(u64, offset)
__field(u64, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
const struct inode *inode = d_inode(ctx->dentry);
+ const struct nfs4_state *state = ctx->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget,
__entry->offset = args->offset;
__entry->count = args->length;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ if (!error) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(layout_stateid->seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(layout_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "iomode=%s offset=%llu count=%llu",
+ "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget,
__entry->fhandle,
show_pnfs_iomode(__entry->iomode),
(unsigned long long)__entry->offset,
- (unsigned long long)__entry->count
+ (unsigned long long)__entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+#define show_pnfs_update_layout_reason(reason) /* map a PNFS_UPDATE_LAYOUT_* value to its printable label */ \
+ __print_symbolic(reason, /* one { value, label } pair per reason */ \
+ { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
+ { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \
+ { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \
+ { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \
+ { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \
+ { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \
+ { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \
+ { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
+ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
+ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+
+TRACE_EVENT(pnfs_update_layout, /* records the requested range, iomode, layout stateid and a reason code */
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ enum pnfs_update_layout_reason reason
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, reason),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(enum pnfs_update_layout_reason, reason)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev; /* inode is dereferenced without a NULL check */
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); /* only a hash of the file handle is stored */
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ __entry->reason = reason;
+ if (lo != NULL) { /* lo may be NULL; the stateid is read only when a layout header exists */
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid); /* seqid converted from big-endian */
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else { /* no layout header: record zeros */
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x (%s)", /* trailing (%s) is the reason label */
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos, /* loff_t is cast to unsigned for %llu */
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ show_pnfs_update_layout_reason(__entry->reason)
+ )
+);
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* _TRACE_NFS4_H */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 59f838cdc009..9f80a086b612 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
- { 1 << NFS_INO_COMMIT, "COMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 452a011ba0d8..8ce4f61cbaa5 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p)
kmem_cache_free(nfs_page_cachep, p);
}
-static void
-nfs_iocounter_inc(struct nfs_io_counter *c)
-{
- atomic_inc(&c->io_count);
-}
-
-static void
-nfs_iocounter_dec(struct nfs_io_counter *c)
-{
- if (atomic_dec_and_test(&c->io_count)) {
- clear_bit(NFS_IO_INPROGRESS, &c->flags);
- smp_mb__after_atomic();
- wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
- }
-}
-
-static int
-__nfs_iocounter_wait(struct nfs_io_counter *c)
-{
- wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
- DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
- set_bit(NFS_IO_INPROGRESS, &c->flags);
- if (atomic_read(&c->io_count) == 0)
- break;
- ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE);
- } while (atomic_read(&c->io_count) != 0 && !ret);
- finish_wait(wq, &q.wait);
- return ret;
-}
-
/**
* nfs_iocounter_wait - wait for i/o to complete
- * @c: nfs_io_counter to use
+ * @l_ctx: nfs_lock_context with io_counter to use
*
* returns -ERESTARTSYS if interrupted by a fatal signal.
* Otherwise returns 0 once the io_count hits 0.
*/
int
-nfs_iocounter_wait(struct nfs_io_counter *c)
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
{
- if (atomic_read(&c->io_count) == 0)
- return 0;
- return __nfs_iocounter_wait(c);
+ return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
+ TASK_KILLABLE);
}
/*
@@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
return ERR_CAST(l_ctx);
}
req->wb_lock_context = l_ctx;
- nfs_iocounter_inc(&l_ctx->io_count);
+ atomic_inc(&l_ctx->io_count);
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
@@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req)
req->wb_page = NULL;
}
if (l_ctx != NULL) {
- nfs_iocounter_dec(&l_ctx->io_count);
+ if (atomic_dec_and_test(&l_ctx->io_count))
+ wake_up_atomic_t(&l_ctx->io_count);
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
@@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
* @desc: IO descriptor
* @hdr: pageio header
*/
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_mirror *mirror;
- u32 midx;
-
set_bit(NFS_IOHDR_REDO, &hdr->flags);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
- /* TODO: Make sure it's right to clean up all mirrors here
- * and not just hdr->pgio_mirror_idx */
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- }
- return -ENOMEM;
}
/**
@@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
unsigned int pagecount, pageused;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
pages = hdr->page_array.pagevec;
@@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*pages++ = last_page = req->wb_page;
}
}
- if (WARN_ON_ONCE(pageused != pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror;
struct nfs_pgio_header *hdr;
int ret;
- mirror = nfs_pgio_current_mirror(desc);
-
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- /* TODO: make sure this is right with mirroring - or
- * should it back out all mirrors? */
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (pgio->pg_error < 0)
+ return pgio->pg_error;
+
if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
return -EINVAL;
@@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
pgio->pg_mirrors_dynamic = NULL;
}
-static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
- const struct nfs_open_context *ctx2)
-{
- return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
-}
-
static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
const struct nfs_lock_context *l2)
{
@@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
mirror->pg_base = req->wb_pgbase;
}
if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
bytes = req->wb_bytes;
nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
if (midx) {
@@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (IS_ERR(dupreq)) {
nfs_page_group_unlock(req);
- return 0;
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
}
nfs_lock_request(dupreq);
@@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- return 0;
+ goto out_failed;
}
return 1;
+
+out_failed:
+ /*
+ * We might have failed before sending any reqs over wire.
+ * Clean up rest of the reqs in mirror pg_list.
+ */
+ if (desc->pg_error) {
+ struct nfs_pgio_mirror *mirror;
+ void (*func)(struct list_head *);
+
+ /* remember fatal errors */
+ if (nfs_error_is_fatal(desc->pg_error))
+ mapping_set_error(desc->pg_inode->i_mapping,
+ desc->pg_error);
+
+ func = desc->pg_completion_ops->error_cleanup;
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ func(&mirror->pg_list);
+ }
+ }
+ return 0;
}
/*
@@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
list_move(&failed, &hdr->pages);
- return -EIO;
+ return desc->pg_error < 0 ? desc->pg_error : -EIO;
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bec0384499f7..a3592cc34a20 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
static LIST_HEAD(pnfs_modules_tbl);
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync);
/* Return the registered pnfs layout driver module matching given id */
@@ -385,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
enum pnfs_iomode iomode;
bool send;
- stateid = lo->plh_stateid;
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
iomode = lo->plh_return_iomode;
send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
}
} else
spin_unlock(&inode->i_lock);
@@ -566,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range)
+ const struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
- int invalid = 0, removed = 0;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
lseg->pls_range.length);
- invalid++;
- removed += mark_lseg_invalid(lseg, tmp_list);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
}
- dprintk("%s:Return %i\n", __func__, invalid - removed);
- return invalid - removed;
+ dprintk("%s:Return %i\n", __func__, remaining);
+ return remaining;
}
/* note free_me must contain lsegs from a single layout_hdr */
@@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_get_layout_hdr(lo);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
- pnfs_clear_retry_layoutget(lo);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -703,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
ret = -EAGAIN;
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&lseg_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
iput(inode);
}
@@ -826,7 +827,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state)
{
int status = 0;
@@ -861,7 +862,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
@@ -894,7 +895,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
lgp->args.minlength = i_size - range->offset;
}
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range = *range;
+ pnfs_copy_range(&lgp->args.range, range);
lgp->args.type = server->pnfs_curr_ld->id;
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
@@ -904,17 +905,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
lseg = nfs4_proc_layoutget(lgp, gfp_flags);
} while (lseg == ERR_PTR(-EAGAIN));
- if (IS_ERR(lseg)) {
- switch (PTR_ERR(lseg)) {
- case -ENOMEM:
- case -ERESTARTSYS:
- break;
- default:
- /* remember that LAYOUTGET failed and suspend trying */
- pnfs_layout_io_set_failed(lo, range->iomode);
- }
- return NULL;
- } else
+ if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+ lseg = NULL;
+ else
pnfs_layout_clear_fail_bit(lo,
pnfs_iomode_to_fail_bit(range->iomode));
@@ -945,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
}
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
{
struct inode *ino = lo->plh_inode;
@@ -962,7 +955,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
goto out;
}
- lrp->args.stateid = stateid;
+ nfs4_stateid_copy(&lrp->args.stateid, stateid);
lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
lrp->args.inode = ino;
lrp->args.range.iomode = iomode;
@@ -1005,7 +998,7 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- stateid = nfsi->layout->plh_stateid;
+ nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
@@ -1033,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
- status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_put_layout_hdr(lo);
out:
@@ -1096,13 +1089,12 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- stateid = lo->plh_stateid;
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lo->plh_flags))
layoutreturn = pnfs_prepare_layoutreturn(lo);
- pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
@@ -1124,7 +1116,7 @@ out_noroc:
pnfs_free_lseg_list(&tmp_list);
pnfs_layoutcommit_inode(ino, true);
if (layoutreturn)
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
return roc;
}
@@ -1149,6 +1141,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
+ pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
@@ -1465,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode)
-{
- if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
- return 1;
- return nfs_wait_bit_killable(key, mode);
-}
-
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
- if (!pnfs_should_retry_layoutget(lo))
- return false;
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
*/
pnfs_layoutcommit_inode(lo->plh_inode, false);
return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
- pnfs_layoutget_retry_bit_wait,
+ nfs_wait_bit_killable,
TASK_UNINTERRUPTIBLE);
}
@@ -1520,14 +1503,23 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
bool first;
- if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NO_PNFS);
goto out;
+ }
- if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+ if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
goto out;
+ }
- if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_MDSTHRESH);
goto out;
+ }
lookup_again:
first = false;
@@ -1535,19 +1527,25 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
}
/* Do we even need to bother with this? */
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BULK_RECALL);
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
/* if LAYOUTGET already failed once we don't try again */
- if (pnfs_layout_io_test_failed(lo, iomode) &&
- !pnfs_should_retry_layoutget(lo))
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
+ }
first = list_empty(&lo->plh_segs);
if (first) {
@@ -1567,8 +1565,11 @@ lookup_again:
* already exists
*/
lseg = pnfs_find_lseg(lo, &arg);
- if (lseg)
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
goto out_unlock;
+ }
}
/*
@@ -1585,11 +1586,16 @@ lookup_again:
dprintk("%s retrying\n", __func__);
goto lookup_again;
}
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_RETURN);
goto out_put_layout_hdr;
}
- if (pnfs_layoutgets_blocked(lo))
+ if (pnfs_layoutgets_blocked(lo)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BLOCKED);
goto out_unlock;
+ }
atomic_inc(&lo->plh_outstanding);
spin_unlock(&ino->i_lock);
@@ -1612,8 +1618,9 @@ lookup_again:
arg.length = PAGE_CACHE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- pnfs_clear_retry_layoutget(lo);
atomic_dec(&lo->plh_outstanding);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1623,7 +1630,7 @@ out:
"(%s, offset: %llu, length: %llu)\n",
__func__, ino->i_sb->s_id,
(unsigned long long)NFS_FILEID(ino),
- lseg == NULL ? "not found" : "found",
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
iomode==IOMODE_RW ? "read/write" : "read-only",
(unsigned long long)pos,
(unsigned long long)count);
@@ -1730,16 +1737,29 @@ out_forget_reply:
}
static void
+pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+{
+ if (lo->plh_return_iomode == iomode)
+ return;
+ if (lo->plh_return_iomode != 0)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+}
+
+int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *return_range)
+ const struct pnfs_layout_range *return_range)
{
struct pnfs_layout_segment *lseg, *next;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
if (list_empty(&lo->plh_segs))
- return;
+ return 0;
+
+ assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(&lseg->pls_range, return_range)) {
@@ -1749,38 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg->pls_range.offset,
lseg->pls_range.length);
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- mark_lseg_invalid(lseg, tmp_list);
+ pnfs_set_plh_return_iomode(lo, return_range->iomode);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lo->plh_flags);
}
+ return remaining;
}
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
struct pnfs_layout_range range = {
.iomode = lseg->pls_range.iomode,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
LIST_HEAD(free_me);
+ bool return_now = false;
spin_lock(&inode->i_lock);
- /* set failure bit so that pnfs path will be retried later */
- pnfs_layout_set_fail_bit(lo, iomode);
- if (lo->plh_return_iomode == 0)
- lo->plh_return_iomode = range.iomode;
- else if (lo->plh_return_iomode != range.iomode)
- lo->plh_return_iomode = IOMODE_ANY;
+ pnfs_set_plh_return_iomode(lo, range.iomode);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
- spin_unlock(&inode->i_lock);
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode = lo->plh_return_iomode;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ return_now = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (return_now)
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ } else {
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ }
pnfs_free_lseg_list(&free_me);
}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -1802,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
rd_size,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
}
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
@@ -1814,13 +1848,19 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- if (pgio->pg_lseg == NULL)
+ if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
wb_size,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_write_mds(pgio);
@@ -1988,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
@@ -2119,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d1990e90e7a0..9f4e2a47f4aa 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -98,7 +98,6 @@ enum {
NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
- NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
};
enum layoutdriver_policy_flags {
@@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
return d;
}
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
- atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
- atomic_dec(&lo->plh_refcount);
- /* wake up waiters for LAYOUTRETURN as that is not needed */
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- }
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
return lseg;
}
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+ return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0;
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
@@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset;
}
+/**
+ * pnfs_mark_layout_returned_if_empty - marks the layout as returned
+ * @lo: layout header
+ *
+ * Note: Caller must hold inode->i_lock
+ */
+static inline void
+pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
+{
+ if (list_empty(&lo->plh_segs))
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+ const struct pnfs_layout_range *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 24655b807d44..81ac6480f9e7 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
} else {
nfs_retry_commit(mds_pages, NULL, cinfo, 0);
pnfs_generic_retry_commit(cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
}
nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
- if (nreq == 0) {
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ if (nreq == 0)
goto out;
- }
atomic_add(nreq, &cinfo->mds->rpcs_out);
@@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
+ if (!pnfs_is_valid_lseg(lseg)) {
+ spin_unlock(cinfo->lock);
+ cinfo->completion_ops->resched_write(cinfo, req);
+ return;
+ }
/* Non-empty buckets hold a reference on the lseg. That ref
* is normally transferred to the COMMIT call and released
* there. It could also be released if the last req is pulled
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0a5e33f33b5c..eb31e23e7def 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+static void nfs_readpage_release(struct nfs_page *req)
+{
+ struct inode *inode = d_inode(req->wb_context->dentry);
+
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+ (long long)req_offset(req));
+
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(inode, req->wb_page, 0);
+
+ unlock_page(req->wb_page);
+ }
+ nfs_release_request(req);
+}
+
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
@@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
- nfs_pageio_add_request(&pgio, new);
+ if (!nfs_pageio_add_request(&pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
+ }
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
@@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
- struct inode *inode = d_inode(req->wb_context->dentry);
-
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
- (long long)req_offset(req));
-
- if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
- unlock_page(req->wb_page);
- }
- nfs_release_request(req);
+ return pgio.pg_error < 0 ? pgio.pg_error : 0;
}
static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page)
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
error = desc->pgio->pg_error;
goto out_unlock;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7b9316406930..5754835a2886 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -21,6 +21,8 @@
#include <linux/nfs_page.h>
#include <linux/backing-dev.h>
#include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
#include <asm/uaccess.h>
@@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc)
{
int ret = 0;
if (wbc->for_reclaim)
- return FLUSH_HIGHPRI | FLUSH_STABLE;
+ return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
if (wbc->sync_mode == WB_SYNC_ALL)
ret = FLUSH_COND_STABLE;
- if (wbc->for_kupdate || wbc->for_background)
- ret |= FLUSH_LOWPRI;
return ret;
}
@@ -545,12 +545,22 @@ try_again:
return head;
}
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+ nfs_unlock_request(req);
+ nfs_end_page_writeback(req);
+ nfs_release_request(req);
+ generic_error_remove_page(page_file_mapping(req->wb_page),
+ req->wb_page);
+}
+
/*
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock)
+ struct page *page, bool nonblock,
+ bool launder)
{
struct nfs_page *req;
int ret = 0;
@@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
- nfs_redirty_request(req);
ret = pgio->pg_error;
+ /*
+ * Remove the problematic req upon fatal errors
+ * in launder case, while other dirty pages can
+ * still be around until they get flushed.
+ */
+ if (nfs_error_is_fatal(ret)) {
+ nfs_context_set_write_error(req->wb_context, ret);
+ if (launder) {
+ nfs_write_error_remove_page(req);
+ goto out;
+ }
+ }
+ nfs_redirty_request(req);
+ ret = -EAGAIN;
} else
nfs_add_stats(page_file_mapping(page)->host,
NFSIOS_WRITEPAGES, 1);
@@ -576,12 +599,14 @@ out:
return ret;
}
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio, bool launder)
{
int ret;
nfs_pageio_cond_complete(pgio, page_file_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+ launder);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
/*
* Write an mmapped page to the server.
*/
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+ struct writeback_control *wbc,
+ bool launder)
{
struct nfs_pageio_descriptor pgio;
struct inode *inode = page_file_mapping(page)->host;
@@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio);
+ err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc);
+ ret = nfs_writepage_locked(page, wbc, false);
unlock_page(page);
return ret;
}
@@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data);
+ ret = nfs_do_writepage(page, wbc, data, false);
unlock_page(page);
return ret;
}
@@ -803,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
* holding the nfs_page lock.
*/
void
-nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
- struct nfs_commit_info *cinfo)
+nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
spin_lock(cinfo->lock);
- nfs_request_add_commit_list_locked(req, dst, cinfo);
+ nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
spin_unlock(cinfo->lock);
nfs_mark_page_unstable(req->wb_page, cinfo);
}
@@ -865,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
{
if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
return;
- nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+ nfs_request_add_commit_list(req, cinfo);
}
static void
@@ -1128,7 +1154,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
if (req == NULL)
return 0;
l_ctx = req->wb_lock_context;
- do_flush = req->wb_page != page || req->wb_context != ctx;
+ do_flush = req->wb_page != page ||
+ !nfs_match_open_context(req->wb_context, ctx);
/* for now, flush if more than 1 request in page_group */
do_flush |= req->wb_this_page != req;
if (l_ctx && flctx &&
@@ -1326,9 +1353,15 @@ static void nfs_async_write_error(struct list_head *head)
}
}
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ nfs_async_write_error(&hdr->pages);
+}
+
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
.error_cleanup = nfs_async_write_error,
.completion = nfs_write_completion,
+ .reschedule_io = nfs_async_write_reschedule_io,
};
void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -1529,27 +1562,21 @@ static void nfs_writeback_result(struct rpc_task *task,
}
}
-
-static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
{
- int ret;
+ return wait_on_atomic_t(&cinfo->rpcs_out,
+ nfs_wait_atomic_killable, TASK_KILLABLE);
+}
- if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
- return 1;
- if (!may_wait)
- return 0;
- ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- return (ret < 0) ? ret : 1;
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+ atomic_inc(&cinfo->rpcs_out);
}
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
- clear_bit(NFS_INO_COMMIT, &nfsi->flags);
- smp_mb__after_atomic();
- wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+ if (atomic_dec_and_test(&cinfo->rpcs_out))
+ wake_up_atomic_t(&cinfo->rpcs_out);
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1666,6 +1693,13 @@ void nfs_retry_commit(struct list_head *page_list,
}
EXPORT_SYMBOL_GPL(nfs_retry_commit);
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
+{
+ __set_page_dirty_nobuffers(req->wb_page);
+}
+
/*
* Commit dirty pages
*/
@@ -1687,7 +1721,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
data->mds_ops, how, 0);
out_bad:
nfs_retry_commit(head, NULL, cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
@@ -1749,8 +1782,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
- if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
- nfs_commit_clear_lock(NFS_I(data->inode));
+ nfs_commit_end(cinfo.mds);
}
static void nfs_commit_release(void *calldata)
@@ -1769,7 +1801,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
.completion = nfs_commit_release_pages,
- .error_cleanup = nfs_commit_clear_lock,
+ .resched_write = nfs_commit_resched_write,
};
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
@@ -1788,30 +1820,25 @@ int nfs_commit_inode(struct inode *inode, int how)
LIST_HEAD(head);
struct nfs_commit_info cinfo;
int may_wait = how & FLUSH_SYNC;
+ int error = 0;
int res;
- res = nfs_commit_set_lock(NFS_I(inode), may_wait);
- if (res <= 0)
- goto out_mark_dirty;
nfs_init_cinfo_from_inode(&cinfo, inode);
+ nfs_commit_begin(cinfo.mds);
res = nfs_scan_commit(inode, &head, &cinfo);
- if (res) {
- int error;
-
+ if (res)
error = nfs_generic_commit_list(inode, &head, how, &cinfo);
- if (error < 0)
- return error;
- if (!may_wait)
- goto out_mark_dirty;
- error = wait_on_bit_action(&NFS_I(inode)->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- if (error < 0)
- return error;
- } else
- nfs_commit_clear_lock(NFS_I(inode));
+ nfs_commit_end(cinfo.mds);
+ if (error < 0)
+ goto out_error;
+ if (!may_wait)
+ goto out_mark_dirty;
+ error = wait_on_commit(cinfo.mds);
+ if (error < 0)
+ return error;
return res;
+out_error:
+ res = error;
/* Note: If we exit without ensuring that the commit is complete,
* we must mark the inode as dirty. Otherwise, future calls to
* sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
@@ -1821,6 +1848,7 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return res;
}
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
@@ -1911,7 +1939,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
{
loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
@@ -1928,7 +1956,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc);
+ ret = nfs_writepage_locked(page, &wbc, launder);
if (ret < 0)
goto out_error;
continue;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 77e7a5cca888..1a03bc3059e8 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -58,7 +58,7 @@ nlm_fclose(struct file *filp)
fput(filp);
}
-static struct nlmsvc_binding nfsd_nlm_ops = {
+static const struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
};
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d8b16c2568f3..5fbf3bbd00d0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -92,7 +92,7 @@ struct nfsd_net {
struct file *rec_file;
bool in_grace;
- struct nfsd4_client_tracking_ops *client_tracking_ops;
+ const struct nfsd4_client_tracking_ops *client_tracking_ops;
time_t nfsd4_lease;
time_t nfsd4_grace;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e7f50c4081d6..7389cb1d7409 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_DOWN;
warn_no_callback_path(clp, reason);
}
static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_FAULT;
warn_no_callback_path(clp, reason);
}
@@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work)
}
void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
cb->cb_clp = clp;
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index c9d6c715c0fb..ce2d010d3b17 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -22,7 +22,7 @@ struct nfs4_layout {
static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
@@ -624,24 +624,39 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfsd_net *nn;
+ ktime_t now, cutoff;
LIST_HEAD(reaplist);
+
switch (task->tk_status) {
case 0:
- return 1;
+ case -NFS4ERR_DELAY:
+ /*
+ * Anything left? If not, then call it done. Note that we don't
+ * take the spinlock since this is an optimization and nothing
+ * should get added until the cb counter goes to zero.
+ */
+ if (list_empty(&ls->ls_layouts))
+ return 1;
+
+ /* Poll the client until it's done with the layout */
+ now = ktime_get();
+ nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id);
+
+ /* Client gets 2 lease periods to return it */
+ cutoff = ktime_add_ns(task->tk_start,
+ nn->nfsd4_lease * NSEC_PER_SEC * 2);
+
+ if (ktime_before(now, cutoff)) {
+ rpc_delay(task, HZ/100); /* 10 mili-seconds */
+ return 0;
+ }
+ /* Fallthrough */
case -NFS4ERR_NOMATCHING_LAYOUT:
trace_layout_recall_done(&ls->ls_stid.sc_stateid);
task->tk_status = 0;
return 1;
- case -NFS4ERR_DELAY:
- /* Poll the client until it's done with the layout */
- /* FIXME: cap number of retries.
- * The pnfs standard states that we need to only expire
- * the client after at-least "lease time" .eg lease-time * 2
- * when failing to communicate a recall
- */
- rpc_delay(task, HZ/100); /* 10 mili-seconds */
- return 0;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
@@ -665,7 +680,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
nfs4_put_stid(&ls->ls_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
.prepare = nfsd4_cb_layout_prepare,
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 819ad812c71b..4cba7865f496 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u
struct inode *inode = d_inode(resfh->fh_dentry);
int status;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = security_inode_setsecctx(resfh->fh_dentry,
label->data, label->len);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (status)
/*
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e3d47091b191..dc8ebecf5618 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dir = nn->rec_file->f_path.dentry;
/* lock the parent */
- mutex_lock(&d_inode(dir)->i_mutex);
+ inode_lock(d_inode(dir));
dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
if (IS_ERR(dentry)) {
@@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
out_put:
dput(dentry);
out_unlock:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
if (status == 0) {
if (nn->in_grace) {
crp = nfs4_client_to_reclaim(dname, nn);
@@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
status = iterate_dir(nn->rec_file, &ctx.ctx);
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
if (!status) {
@@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
list_del(&entry->list);
kfree(entry);
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
nfs4_reset_creds(original_cred);
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
@@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
dir = nn->rec_file->f_path.dentry;
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
dentry = lookup_one_len(name, dir, namlen);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
@@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
out:
dput(dentry);
out_unlock:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
return status;
}
@@ -631,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
return -ENOENT;
}
-static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
.init = nfsd4_legacy_tracking_init,
.exit = nfsd4_legacy_tracking_exit,
.create = nfsd4_create_clid_dir,
@@ -1050,7 +1050,7 @@ out_err:
printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
}
-static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.init = nfsd4_init_cld_pipe,
.exit = nfsd4_remove_cld_pipe,
.create = nfsd4_cld_create,
@@ -1394,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
kfree(legacy);
}
-static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
.init = nfsd4_umh_cltrack_init,
.exit = NULL,
.create = nfsd4_umh_cltrack_create,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index df5dba687265..c484a2b6cd10 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab;
static void free_session(struct nfsd4_session *);
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static bool is_session_dead(struct nfsd4_session *ses)
{
@@ -1857,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+static int strdup_if_nonnull(char **target, char *source)
{
- if (source->cr_principal) {
- target->cr_principal =
- kstrdup(source->cr_principal, GFP_KERNEL);
- if (target->cr_principal == NULL)
+	if (source) {	/* duplicate only when there is a string to copy */
+		*target = kstrdup(source, GFP_KERNEL);
+		if (!*target)
return -ENOMEM;
} else
- target->cr_principal = NULL;
+		*target = NULL;	/* no source string: clear target, not an error */
+	return 0;
+}
+
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+ int ret;
+
+ ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
+ if (ret)
+ return ret;
+ ret = strdup_if_nonnull(&target->cr_raw_principal,
+ source->cr_raw_principal);
+ if (ret)
+ return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
@@ -1969,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
return false;
if (!svc_rqst_integrity_protected(rqstp))
return false;
+ if (cl->cl_cred.cr_raw_principal)
+ return 0 == strcmp(cl->cl_cred.cr_raw_principal,
+ cr->cr_raw_principal);
if (!cr->cr_principal)
return false;
return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
@@ -2240,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
base = resp->cstate.data_offset;
slot->sl_datalen = buf->len - base;
if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
- WARN("%s: sessions DRC could not cache compound\n", __func__);
+ WARN(1, "%s: sessions DRC could not cache compound\n",
+ __func__);
return;
}
@@ -2365,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
+ new = create_client(exid->clname, rqstp, &verf);
+ if (new == NULL)
+ return nfserr_jukebox;
+
switch (exid->spa_how) {
case SP4_MACH_CRED:
- if (!svc_rqst_integrity_protected(rqstp))
- return nfserr_inval;
+ if (!svc_rqst_integrity_protected(rqstp)) {
+ status = nfserr_inval;
+ goto out_nolock;
+ }
+ /*
+ * Sometimes userspace doesn't give us a principal.
+ * Which is a bug, really. Anyway, we can't enforce
+ * MACH_CRED in that case, better to give up now:
+ */
+ if (!new->cl_cred.cr_principal &&
+ !new->cl_cred.cr_raw_principal) {
+ status = nfserr_serverfault;
+ goto out_nolock;
+ }
+ new->cl_mach_cred = true;
case SP4_NONE:
break;
default: /* checked by xdr code */
@@ -2377,10 +2411,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
- return nfserr_encr_alg_unsupp;
+		status = nfserr_encr_alg_unsupp;
+		goto out_nolock;	/* 'new' is live here; out_nolock expires it */
}
- new = create_client(exid->clname, rqstp, &verf);
- if (new == NULL)
- return nfserr_jukebox;
-
/* Cases below refer to rfc 5661 section 18.35.4: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&exid->clname, nn);
@@ -2442,7 +2472,6 @@ out_new:
goto out;
}
new->cl_minorversion = cstate->minorversion;
- new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -2460,6 +2489,7 @@ out_copy:
out:
spin_unlock(&nn->client_lock);
+out_nolock:
if (new)
expire_client(new);
if (unconf)
@@ -3648,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
nfs4_put_stid(&dp->dl_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
.prepare = nfsd4_cb_recall_prepare,
.done = nfsd4_cb_recall_done,
.release = nfsd4_cb_recall_release,
@@ -4541,8 +4571,7 @@ static void
laundromat_main(struct work_struct *laundry)
{
time_t t;
- struct delayed_work *dwork = container_of(laundry, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(laundry);
struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
laundromat_work);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 2087bae17582..f84fe6bf9aee 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_NFSD_NFSFH_H
#define _LINUX_NFSD_NFSFH_H
+#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
#include <uapi/linux/nfsd/nfsfh.h>
@@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
+#ifdef CONFIG_CRC32
+/**
+ * knfsd_fh_hash - calculate the crc32 hash for the filehandle
+ * @fh: pointer to filehandle
+ *
+ * Return: a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+}
+#else /* !CONFIG_CRC32: no crc32_le() here, every filehandle hashes to 0 */
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return 0;
+}
+#endif /* CONFIG_CRC32 */
+
#ifdef CONFIG_NFSD_V3
/*
* The wcc data stored in current_fh should be cleared
@@ -265,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
}
inode = d_inode(dentry);
- mutex_lock_nested(&inode->i_mutex, subclass);
+ inode_lock_nested(inode, subclass);
fill_pre_wcc(fhp);
fhp->fh_locked = true;
}
@@ -284,7 +307,7 @@ fh_unlock(struct svc_fh *fhp)
{
if (fhp->fh_locked) {
fill_post_wcc(fhp);
- mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
+ inode_unlock(d_inode(fhp->fh_dentry));
fhp->fh_locked = false;
}
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ad4e2377dd63..45007acaf364 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -14,9 +14,13 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <net/net_namespace.h>
#include "nfsd.h"
#include "cache.h"
@@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net)
nfsd_shutdown_generic();
}
+/*
+ * inetaddr notifier callback.  For a NETDEV_DOWN event in a netns that has
+ * an nfsd service, hand the removed address to svc_age_temp_xprts_now().
+ */
+static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+			       void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct net *net = dev_net(ifa->ifa_dev->dev);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (event == NETDEV_DOWN && nn->nfsd_serv) {
+		struct sockaddr_in sin;
+
+		dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = ifa->ifa_local;
+		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inetaddr_notifier = {
+ .notifier_call = nfsd_inetaddr_event, /* IPv4 address event callback */
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* inet6addr notifier: on NETDEV_DOWN pass the removed address to sunrpc. */
+static int nfsd_inet6addr_event(struct notifier_block *this,
+				unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+	struct net_device *dev = ifa->idev->dev;
+	struct net *net = dev_net(dev);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct sockaddr_in6 sin6;
+
+	if (event != NETDEV_DOWN || !nn->nfsd_serv)
+		return NOTIFY_DONE;
+
+	dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+	sin6.sin6_family = AF_INET6;
+	sin6.sin6_addr = ifa->addr;
+	/* link-local is per-interface: give a real scope id, not stack junk */
+	if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		sin6.sin6_scope_id = dev->ifindex;
+	svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inet6addr_notifier = {
+	.notifier_call = nfsd_inet6addr_event,
+};
+#endif
+
static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
* started, then nfsd_last_thread will be run before any of this
- * other initialization has been done.
+ * other initialization has been done except the rpcb information.
*/
+ svc_rpcb_cleanup(serv, net);
if (!nn->nfsd_net_up)
return;
- nfsd_shutdown_net(net);
-
- svc_rpcb_cleanup(serv, net);
+ nfsd_shutdown_net(net);
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
nfsd_export_flush(net);
@@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net)
}
set_max_drc();
+ register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 99432b7ecb9c..c050c53036a6 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -65,7 +65,7 @@ struct nfsd4_callback {
struct nfs4_client *cb_clp;
u32 cb_minorversion;
struct rpc_message cb_msg;
- struct nfsd4_callback_ops *cb_ops;
+ const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
int cb_seq_status;
int cb_status;
@@ -599,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 0befe762762b..3287041905da 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -8,6 +8,47 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include "nfsfh.h"
+
+DECLARE_EVENT_CLASS(nfsd_io_class,	/* record shared by the nfsd I/O events */
+	TP_PROTO(struct svc_rqst *rqstp,
+		 struct svc_fh *fhp,
+		 loff_t offset,
+		 int len),
+	TP_ARGS(rqstp, fhp, offset, len),
+	TP_STRUCT__entry(
+		__field(__be32, xid)
+		__field_struct(struct knfsd_fh, fh)	/* copy; printed as a hash */
+		__field(loff_t, offset)
+		__field(int, len)
+	),
+	TP_fast_assign(
+		__entry->xid = rqstp->rq_xid;
+		fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+		__entry->offset = offset;
+		__entry->len = len;
+	),
+	TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
+		  __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+		  __entry->offset, __entry->len)
+)
+
+#define DEFINE_NFSD_IO_EVENT(name) /* one named event of nfsd_io_class */ \
+DEFINE_EVENT(nfsd_io_class, name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ loff_t offset, \
+ int len), \
+ TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_IO_EVENT(read_start); /* nfsd_read() entered */
+DEFINE_NFSD_IO_EVENT(read_opened); /* nfsd_open() succeeded */
+DEFINE_NFSD_IO_EVENT(read_io_done); /* nfsd_vfs_read() returned */
+DEFINE_NFSD_IO_EVENT(read_done); /* file put, nfsd_read() about to return */
+DEFINE_NFSD_IO_EVENT(write_start); /* nfsd_write() entered */
+DEFINE_NFSD_IO_EVENT(write_opened); /* permission check or nfsd_open() passed */
+DEFINE_NFSD_IO_EVENT(write_io_done); /* past the nfsd_vfs_write() step */
+DEFINE_NFSD_IO_EVENT(write_done); /* nfsd_write() exit, error paths included */
#include "state.h"
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d41c149fae75..5d2a57e4c03a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -43,6 +43,7 @@
#include "nfsd.h"
#include "vfs.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
@@ -492,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
host_error = security_inode_setsecctx(dentry, label->data, label->len);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return nfserrno(host_error);
}
#else
@@ -997,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct raparms *ra;
__be32 err;
+ trace_read_start(rqstp, fhp, offset, vlen);
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
return err;
ra = nfsd_init_raparms(file);
+
+ trace_read_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+ trace_read_io_done(rqstp, fhp, offset, vlen);
+
if (ra)
nfsd_put_raparams(file, ra);
fput(file);
+ trace_read_done(rqstp, fhp, offset, vlen);
+
return err;
}
@@ -1022,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
{
__be32 err = 0;
+ trace_write_start(rqstp, fhp, offset, vlen);
+
if (file) {
err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
} else {
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
if (cnt)
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
cnt, stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
fput(file);
}
out:
+ trace_write_done(rqstp, fhp, offset, vlen);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 10b22527a617..21a1e2e0d92f 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
@@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret == 1)
ret = 0;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index aba43811d6ef..e8fe24882b5b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
flags = nilfs_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = NILFS_I(inode)->i_flags;
@@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
nilfs_mark_inode_dirty(inode);
ret = nilfs_transaction_commit(inode->i_sb);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7343844e6b6..7f5d3d9f1c37 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1416,7 +1416,8 @@ static int __init nilfs_init_cachep(void)
{
nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
sizeof(struct nilfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ nilfs_inode_init_once);
if (!nilfs_inode_cachep)
goto fail;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954c30..741077deef3b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
break;
}
spin_unlock(&next_i->i_lock);
- next_i = list_entry(next_i->i_sb_list.next,
- struct inode, i_sb_list);
+ next_i = list_next_entry(next_i, i_sb_list);
}
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc0df4442f7b..cfcbf114676e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,9 +92,6 @@
#include "fsnotify.h"
struct srcu_struct fsnotify_mark_srcu;
-static DEFINE_SPINLOCK(destroy_lock);
-static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
atomic_dec(&group->num_marks);
}
+/* Callback handed to call_srcu(): puts the mark that embeds @rcu. */
+static void fsnotify_mark_free_rcu(struct rcu_head *rcu)
+{
+	struct fsnotify_mark *mark =
+		container_of(rcu, struct fsnotify_mark, g_rcu);
+
+	fsnotify_put_mark(mark);
+}
+
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a call_srcu
+ * callback. Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
@@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
/*
* Some groups like to know that marks are being freed. This is a
@@ -385,11 +388,7 @@ err:
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
-
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
return ret;
}
@@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
atomic_set(&mark->refcnt, 1);
mark->free_mark = free_mark;
}
-
-static int fsnotify_mark_destroy(void *ignored)
-{
- struct fsnotify_mark *mark, *next;
- struct list_head private_destroy_list;
-
- for (;;) {
- spin_lock(&destroy_lock);
- /* exchange the list head */
- list_replace_init(&destroy_list, &private_destroy_list);
- spin_unlock(&destroy_lock);
-
- synchronize_srcu(&fsnotify_mark_srcu);
-
- list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
- list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
- }
-
- wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
- }
-
- return 0;
-}
-
-static int __init fsnotify_mark_init(void)
-{
- struct task_struct *thread;
-
- thread = kthread_run(fsnotify_mark_destroy, NULL,
- "fsnotify_mark");
- if (IS_ERR(thread))
- panic("unable to start fsnotify mark destruction thread.");
-
- return 0;
-}
-device_initcall(fsnotify_mark_init);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9e38dafa3bc7..b2eff5816adc 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
err = filemap_write_and_wait_range(vi->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&vi->i_mutex);
+ inode_lock(vi);
BUG_ON(!S_ISDIR(vi->i_mode));
/* If the bitmap attribute inode is in memory sync it, too. */
@@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
else
ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
"%u.", datasync ? "data" : "", vi->i_ino, -ret);
- mutex_unlock(&vi->i_mutex);
+ inode_unlock(vi);
return ret;
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 9d383e5eff0e..bed4d427dfae 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t written = 0;
ssize_t err;
- mutex_lock(&vi->i_mutex);
+ inode_lock(vi);
/* We can write back this queue in page reclaim. */
current->backing_dev_info = inode_to_bdi(vi);
err = ntfs_prepare_file_for_write(iocb, from);
if (iov_iter_count(from) && !err)
written = ntfs_perform_write(file, from, iocb->ki_pos);
current->backing_dev_info = NULL;
- mutex_unlock(&vi->i_mutex);
+ inode_unlock(vi);
if (likely(written > 0)) {
err = generic_write_sync(file, iocb->ki_pos, written);
if (err < 0)
@@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
err = filemap_write_and_wait_range(vi->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&vi->i_mutex);
+ inode_lock(vi);
BUG_ON(S_ISDIR(vi->i_mode));
if (!datasync || !NInoNonResident(NTFS_I(vi)))
@@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
else
ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
"%u.", datasync ? "data" : "", vi->i_ino, -ret);
- mutex_unlock(&vi->i_mutex);
+ inode_unlock(vi);
return ret;
}
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index d80e3315cab0..9793e68ba1dd 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
ntfs_error(vol->sb, "Quota inodes are not open.");
return false;
}
- mutex_lock(&vol->quota_q_ino->i_mutex);
+ inode_lock(vol->quota_q_ino);
ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
if (!ictx) {
ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
ntfs_index_entry_mark_dirty(ictx);
set_done:
ntfs_index_ctx_put(ictx);
- mutex_unlock(&vol->quota_q_ino->i_mutex);
+ inode_unlock(vol->quota_q_ino);
/*
* We set the flag so we do not try to mark the quotas out of date
* again on remount.
@@ -110,7 +110,7 @@ done:
err_out:
if (ictx)
ntfs_index_ctx_put(ictx);
- mutex_unlock(&vol->quota_q_ino->i_mutex);
+ inode_unlock(vol->quota_q_ino);
return false;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d1a853585b53..1b38abdaa3ed 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
* Find the inode number for the hibernation file by looking up the
* filename hiberfil.sys in the root directory.
*/
- mutex_lock(&vol->root_ino->i_mutex);
+ inode_lock(vol->root_ino);
mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
&name);
- mutex_unlock(&vol->root_ino->i_mutex);
+ inode_unlock(vol->root_ino);
if (IS_ERR_MREF(mref)) {
ret = MREF_ERR(mref);
/* If the file does not exist, Windows is not hibernated. */
@@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol)
* Find the inode number for the quota file by looking up the filename
* $Quota in the extended system files directory $Extend.
*/
- mutex_lock(&vol->extend_ino->i_mutex);
+ inode_lock(vol->extend_ino);
mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
&name);
- mutex_unlock(&vol->extend_ino->i_mutex);
+ inode_unlock(vol->extend_ino);
if (IS_ERR_MREF(mref)) {
/*
* If the file does not exist, quotas are disabled and have
@@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
* Find the inode number for the transaction log file by looking up the
* filename $UsnJrnl in the extended system files directory $Extend.
*/
- mutex_lock(&vol->extend_ino->i_mutex);
+ inode_lock(vol->extend_ino);
mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
&name);
- mutex_unlock(&vol->extend_ino->i_mutex);
+ inode_unlock(vol->extend_ino);
if (IS_ERR_MREF(mref)) {
/*
* If the file does not exist, transaction logging is disabled,
@@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void)
ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- ntfs_big_inode_init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d6526dc..d002579c6f2b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
@@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_value_update_clusters,
@@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_tree_update_clusters,
@@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &dx_root->dr_list;
}
-static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
.eo_update_clusters = ocfs2_dx_root_update_clusters,
@@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
return CONTIG_NONE;
}
-static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_refcount_tree_update_clusters,
@@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
- struct ocfs2_extent_tree_operations *ops)
+ const struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
@@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
goto bail;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
out_commit:
ocfs2_commit_trans(osb, handle);
out:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
@@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+ BUG_ON(inode_trylock(tl_inode));
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+ BUG_ON(inode_trylock(tl_inode));
di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
- mutex_lock(&data_alloc_inode->i_mutex);
+ inode_lock(data_alloc_inode);
status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
if (status < 0) {
@@ -6035,7 +6035,7 @@ out_unlock:
ocfs2_inode_unlock(data_alloc_inode, 1);
out_mutex:
- mutex_unlock(&data_alloc_inode->i_mutex);
+ inode_unlock(data_alloc_inode);
iput(data_alloc_inode);
out:
@@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
status = __ocfs2_flush_truncate_log(osb);
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
return status;
}
@@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
bail:
- if (tl_inode)
- iput(tl_inode);
+ iput(tl_inode);
brelse(tl_bh);
if (status < 0) {
@@ -6209,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
num_recs);
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
for(i = 0; i < num_recs; i++) {
if (ocfs2_truncate_log_needs_flush(osb)) {
status = __ocfs2_flush_truncate_log(osb);
@@ -6240,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
}
bail_up:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
return status;
}
@@ -6347,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
goto out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
@@ -6396,7 +6395,7 @@ out_unlock:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
out:
while(head) {
@@ -6440,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
handle_t *handle;
int ret = 0;
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
while (head) {
if (ocfs2_truncate_log_needs_flush(osb)) {
@@ -6472,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
}
}
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
while (head) {
/* Premature exit may have left some dangling items. */
@@ -7356,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
if (ret < 0) {
@@ -7423,7 +7422,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
return ret;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fb09b97db162..f3dc1b0dfffc 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -54,7 +54,7 @@
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
- struct ocfs2_extent_tree_operations *et_ops;
+ const struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
struct ocfs2_caching_info *et_ci;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7f604727f487..794fd1587f34 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2046,9 +2046,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
int ret = 0;
unsigned int truncated_clusters;
- mutex_lock(&osb->osb_tl_inode->i_mutex);
+ inode_lock(osb->osb_tl_inode);
truncated_clusters = osb->truncated_clusters;
- mutex_unlock(&osb->osb_tl_inode->i_mutex);
+ inode_unlock(osb->osb_tl_inode);
/*
* Check whether we can succeed in allocating if we free
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 709fbbd44c65..a3cc6d2fc896 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1780,8 +1780,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
}
++live_threshold;
atomic_set(&reg->hr_steady_iterations, live_threshold);
- /* unsteady_iterations is double the steady_iterations */
- atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+ /* unsteady_iterations is triple the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 72afdca3cea7..ebe543894db0 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -757,7 +757,7 @@ int o2nm_depend_item(struct config_item *item)
void o2nm_undepend_item(struct config_item *item)
{
- configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+ configfs_undepend_item(item);
}
int o2nm_depend_this_node(void)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ffecf89c8c1c..e1adf285fc31 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
mlog_errno(ret);
goto out;
}
- mutex_lock(&dx_alloc_inode->i_mutex);
+ inode_lock(dx_alloc_inode);
ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
if (ret) {
@@ -4410,7 +4410,7 @@ out_unlock:
ocfs2_inode_unlock(dx_alloc_inode, 1);
out_mutex:
- mutex_unlock(&dx_alloc_inode->i_mutex);
+ inode_unlock(dx_alloc_inode);
brelse(dx_alloc_bh);
out:
iput(dx_alloc_inode);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8c83ff..68c607e63ff6 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -376,17 +376,6 @@ struct dlm_lock
lksb_kernel_allocated:1;
};
-
-#define DLM_LKSB_UNUSED1 0x01
-#define DLM_LKSB_PUT_LVB 0x02
-#define DLM_LKSB_GET_LVB 0x04
-#define DLM_LKSB_UNUSED2 0x08
-#define DLM_LKSB_UNUSED3 0x10
-#define DLM_LKSB_UNUSED4 0x20
-#define DLM_LKSB_UNUSED5 0x40
-#define DLM_LKSB_UNUSED6 0x80
-
-
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
DLM_CONVERTING_LIST = 1,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84f2f8079466..9477d6e1de37 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
if (test_bit(node, res->refmap)) {
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ /* get an extra reference on the mle.
+ * otherwise the assert_master from the new
+ * master will destroy this.
+ */
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2544,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (oldmle) {
+ if (ret != -EEXIST && oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
@@ -2554,6 +2559,7 @@ fail:
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
@@ -2571,17 +2577,6 @@ fail:
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
- /* get an extra reference on the mle.
- * otherwise the assert_master from the new
- * master will destroy this.
- * also, make sure that all callers of dlm_get_mle
- * take both dlm->spinlock and dlm->master_lock */
- spin_lock(&dlm->spinlock);
- spin_lock(&dlm->master_lock);
- dlm_get_mle_inuse(mle);
- spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
-
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
@@ -3050,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = 0;
if (!dlm_grab(dlm))
- return -EINVAL;
+ return 0;
name = migrate->name;
namelen = migrate->namelen;
@@ -3141,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
mlog(0, "tried to migrate %.*s, but some "
"process beat me to it\n",
namelen, name);
- ret = -EEXIST;
+ spin_unlock(&tmp->spinlock);
+ return -EEXIST;
} else {
/* bad. 2 NODES are trying to migrate! */
mlog(ML_ERROR, "migration error mle: "
@@ -3312,6 +3308,15 @@ top:
mle->new_master != dead_node)
continue;
+ if (mle->new_master == dead_node && mle->inuse) {
+ mlog(ML_NOTICE, "%s: target %u died during "
+ "migration from %u, the MLE is "
+ "still keep used, ignore it!\n",
+ dlm->name, dead_node,
+ mle->master);
+ continue;
+ }
+
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..c5bdf02c213b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
char *buf = NULL;
struct dlm_work_item *item = NULL;
struct dlm_lock_resource *res = NULL;
+ unsigned int hash;
if (!dlm_grab(dlm))
return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
/* lookup the lock to see if we have a secondary queue for this
* already... just add the locks in and this will have its owner
* and RECOVERY flag changed when it completes. */
- res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
}
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
} else {
+ spin_unlock(&dlm->spinlock);
/* need to allocate, just like if it was
* mastered here normally */
res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -2450,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
- if (test_bit(idx, dlm->recovery_map))
- mlog(0, "domain %s, node %u already added "
- "to recovery map!\n", dlm->name, idx);
- else
- set_bit(idx, dlm->recovery_map);
+ set_bit(idx, dlm->recovery_map);
}
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9dbab68c..1082b2c3014b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
if (!dlm_grab(dlm))
- return DLM_REJECTED;
+ return DLM_FORWARD;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27dcb18a..03768bb3aab1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 20276e340339..474e57f834e6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned int gen;
int noqueue_attempted = 0;
int dlm_locked = 0;
+ int kick_dc = 0;
if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
mlog_errno(-EINVAL);
@@ -1524,7 +1525,12 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
+ if (kick_dc)
+ ocfs2_wake_downconvert_thread(osb);
out:
/*
* This is helping work around a lock inversion between the page lock
@@ -2432,12 +2438,6 @@ bail:
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS who will then
* immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
*/
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
@@ -2449,8 +2449,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
- if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
- ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..7cb38fdca229 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
}
generic_fillattr(inode, stat);
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ stat->blocks += (stat->size + 511)>>9;
/* We set the blksize from the cluster size for performance */
stat->blksize = osb->s_clustersize;
@@ -1864,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* This prevents concurrent writes on other nodes
@@ -1983,7 +1991,7 @@ out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -2291,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
relock:
/*
@@ -2427,7 +2435,7 @@ out:
ocfs2_rw_unlock(inode, rw_level);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (written)
ret = written;
@@ -2539,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
int ret = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_SET:
@@ -2577,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret)
return ret;
return offset;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 97a563bab9a8..36294446d960 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail;
}
- mutex_lock(&inode_alloc_inode->i_mutex);
+ inode_lock(inode_alloc_inode);
status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
if (status < 0) {
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
mlog_errno(status);
goto bail;
@@ -680,7 +680,7 @@ bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_inode_unlock(inode_alloc_inode, 1);
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
brelse(inode_alloc_bh);
bail:
iput(inode_alloc_inode);
@@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
/* Lock the orphan dir. The lock will be held for the entire
* delete_inode operation. We do this now to avoid races with
* recovery completion on other nodes. */
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
mlog_errno(status);
goto bail;
@@ -803,7 +803,7 @@ bail_unlock_dir:
return status;
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
brelse(orphan_dir_bh);
bail:
iput(orphan_dir_inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3cb097ccce60..4506ec5ec2ea 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
unsigned oldflags;
int status;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
@@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
brelse(bh);
@@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
struct ocfs2_dinode *dinode_alloc = NULL;
if (inode_alloc)
- mutex_lock(&inode_alloc->i_mutex);
+ inode_lock(inode_alloc);
if (o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
@@ -317,7 +317,7 @@ bail:
ocfs2_inode_unlock(inode_alloc, 0);
if (inode_alloc)
- mutex_unlock(&inode_alloc->i_mutex);
+ inode_unlock(inode_alloc);
brelse(bh);
@@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
struct ocfs2_dinode *gb_dinode = NULL;
if (gb_inode)
- mutex_lock(&gb_inode->i_mutex);
+ inode_lock(gb_inode);
if (o2info_coherent(&ffg->iff_req)) {
status = ocfs2_inode_lock(gb_inode, &bh, 0);
@@ -604,11 +604,9 @@ bail:
ocfs2_inode_unlock(gb_inode, 0);
if (gb_inode)
- mutex_unlock(&gb_inode->i_mutex);
-
- if (gb_inode)
- iput(gb_inode);
+ inode_unlock(gb_inode);
+ iput(gb_inode);
brelse(bh);
return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 13534f4fe5b5..61b833b721d8 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
// up_write(&journal->j_trans_barrier);
done:
- if (inode)
- iput(inode);
+ iput(inode);
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1687,9 +1686,7 @@ done:
if (got_lock)
ocfs2_inode_unlock(inode, 1);
- if (inode)
- iput(inode);
-
+ iput(inode);
brelse(bh);
return status;
@@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
ocfs2_inode_unlock(inode, 1);
bail:
- if (inode)
- iput(inode);
+ iput(inode);
return status;
}
@@ -2092,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
return status;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
@@ -2110,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
out_cluster:
ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
return status;
}
@@ -2200,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
oi->ip_next_orphan = NULL;
if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = ocfs2_rw_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2239,7 +2235,7 @@ unlock_inode:
unlock_rw:
ocfs2_rw_unlock(inode, 1);
unlock_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* clear dio flag in ocfs2_inode_info */
oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0711..7d62c43a2c3e 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
brelse(alloc_bh);
- if (inode)
- iput(inode);
+ iput(inode);
trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
@@ -415,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
@@ -469,12 +468,11 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
- if (local_alloc_inode)
- iput(local_alloc_inode);
+ iput(local_alloc_inode);
kfree(alloc_copy);
}
@@ -508,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = ocfs2_read_inode_block_full(inode, &alloc_bh,
OCFS2_BH_IGNORE_CACHE);
@@ -541,7 +539,7 @@ bail:
brelse(alloc_bh);
if (inode) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
}
@@ -573,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
@@ -603,7 +601,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
brelse(main_bm_bh);
@@ -645,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&local_alloc_inode->i_mutex);
+ inode_lock(local_alloc_inode);
/*
* We must double check state and allocator bits because
@@ -711,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
status = 0;
bail:
if (status < 0 && local_alloc_inode) {
- mutex_unlock(&local_alloc_inode->i_mutex);
+ inode_unlock(local_alloc_inode);
iput(local_alloc_inode);
}
@@ -1327,9 +1325,7 @@ bail:
brelse(main_bm_bh);
- if (main_bm_inode)
- iput(main_bm_inode);
-
+ iput(main_bm_inode);
kfree(alloc_copy);
if (ac)
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 124471d26a73..e3d05d9901a3 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
* context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
*/
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -338,7 +338,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
if (context->data_ac) {
ocfs2_free_alloc_context(context->data_ac);
@@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
goto out;
}
- mutex_lock(&gb_inode->i_mutex);
+ inode_lock(gb_inode);
ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
if (ret) {
@@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
goto out_unlock_gb_mutex;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
@@ -708,11 +708,11 @@ out_commit:
brelse(gd_bh);
out_unlock_tl_inode:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
- mutex_unlock(&gb_inode->i_mutex);
+ inode_unlock(gb_inode);
brelse(gb_bh);
iput(gb_inode);
@@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* This prevents concurrent writes from other nodes
@@ -969,7 +969,7 @@ out_inode_unlock:
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return status;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index afb81eae2c18..6b3e87189a64 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1045,7 +1045,7 @@ leave:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -1664,7 +1664,7 @@ bail:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -1683,8 +1683,7 @@ bail:
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
- if (new_inode)
- iput(new_inode);
+ iput(new_inode);
ocfs2_free_dir_lookup_result(&target_lookup_res);
ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -2122,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
return ret;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (ret < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
mlog_errno(ret);
@@ -2227,7 +2226,7 @@ out:
if (ret) {
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
}
@@ -2373,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
name, strlen(name));
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* find it's spot in the orphan directory */
status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
@@ -2388,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(orphan_dir_inode),
- orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* do the i_nlink dance! :) */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
@@ -2496,7 +2495,7 @@ out:
ocfs2_free_alloc_context(inode_ac);
/* Unroll orphan dir locking */
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
ocfs2_inode_unlock(orphan_dir, 1);
iput(orphan_dir);
}
@@ -2603,7 +2602,7 @@ leave:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -2690,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
bail_unlock_orphan:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
ocfs2_free_dir_lookup_result(&orphan_insert);
@@ -2722,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
mlog_errno(status);
goto bail;
@@ -2771,7 +2770,7 @@ bail_commit:
bail_unlock_orphan:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
brelse(orphan_dir_bh);
iput(orphan_dir_inode);
@@ -2835,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mlog_errno(status);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
goto leave;
}
@@ -2902,7 +2901,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
orphan_unlock:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
leave:
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index b6d51333ad02..d153e6e31529 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -82,7 +82,7 @@ struct ocfs2_quota_chunk {
extern struct kmem_cache *ocfs2_dquot_cachep;
extern struct kmem_cache *ocfs2_qf_chunk_cachep;
-extern struct qtree_fmt_operations ocfs2_global_ops;
+extern const struct qtree_fmt_operations ocfs2_global_ops;
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb, int slot_num);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d67220887..9c9dd30bc945 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
dquot->dq_id);
}
-struct qtree_fmt_operations ocfs2_global_ops = {
+const struct qtree_fmt_operations ocfs2_global_ops = {
.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
.disk2mem_dqblk = ocfs2_global_disk2memdqb,
.is_id = ocfs2_global_is_id,
@@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
WARN_ON(bh != oinfo->dqi_gqi_bh);
spin_unlock(&dq_data_lock);
if (ex) {
- mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+ inode_lock(oinfo->dqi_gqinode);
down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
} else {
down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
@@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
if (ex) {
up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
- mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+ inode_unlock(oinfo->dqi_gqinode);
} else {
up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 252119860e6c..3eff031aaf26 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- mutex_lock(&alloc_inode->i_mutex);
+ inode_lock(alloc_inode);
ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
if (ret) {
@@ -867,7 +867,7 @@ out_unlock:
}
out_mutex:
if (alloc_inode) {
- mutex_unlock(&alloc_inode->i_mutex);
+ inode_unlock(alloc_inode);
iput(alloc_inode);
}
out:
@@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out;
}
- mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(new_inode, I_MUTEX_CHILD);
ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
OI_LS_REFLINK_TARGET);
if (ret) {
@@ -4231,7 +4231,7 @@ inode_unlock:
ocfs2_inode_unlock(new_inode, 1);
brelse(new_bh);
out_unlock:
- mutex_unlock(&new_inode->i_mutex);
+ inode_unlock(new_inode);
out:
if (!ret) {
ret = filemap_fdatawait(inode->i_mapping);
@@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
return error;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = dquot_initialize(dir);
if (!error)
error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!error)
fsnotify_create(dir, new_dentry);
return error;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 79b8021302b3..576b9a04873f 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
@@ -375,7 +375,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
@@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
@@ -590,7 +590,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e78a203d44c8..1e09592148ad 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
if (si == NULL)
return;
- if (si->si_inode)
- iput(si->si_inode);
+ iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
@@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ /*
+ * if writing the slot block failed, invalidate the slot to avoid
+ * overwriting it during dismount in case another node has rightly mounted
+ */
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, osb->slot_num);
+ osb->slot_num = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ }
bail:
return status;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index fc6d25f6d444..2f19aeec5482 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
if (ac->ac_which != OCFS2_AC_USE_LOCAL)
ocfs2_inode_unlock(inode, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
ac->ac_inode = NULL;
@@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
return -EINVAL;
}
- mutex_lock(&alloc_inode->i_mutex);
+ inode_lock(alloc_inode);
status = ocfs2_inode_lock(alloc_inode, &bh, 1);
if (status < 0) {
- mutex_unlock(&alloc_inode->i_mutex);
+ inode_unlock(alloc_inode);
iput(alloc_inode);
mlog_errno(status);
@@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- mutex_lock(&inode_alloc_inode->i_mutex);
+ inode_lock(inode_alloc_inode);
status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
if (status < 0) {
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
iput(inode_alloc_inode);
mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
(u32)suballoc_slot, status);
@@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
ocfs2_inode_unlock(inode_alloc_inode, 0);
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
iput(inode_alloc_inode);
brelse(alloc_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..faa1365097bc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb,
int status, user_stack = 0;
char *p;
u32 tmp;
+ int token, option;
+ substring_t args[MAX_OPT_ARGS];
trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
@@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb,
}
while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
- substring_t args[MAX_OPT_ARGS];
-
if (!*p)
continue;
@@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->atime_quantum = option;
break;
case Opt_slot:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->slot = (s16)option;
break;
case Opt_commit:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
- if (inode)
- iput(inode);
+ iput(inode);
if (status)
mlog_errno(status);
@@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f0e241ffd94f..7d3d979f57d9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
mlog_errno(ret);
goto out;
}
- mutex_lock(&xb_alloc_inode->i_mutex);
+ inode_lock(xb_alloc_inode);
ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
if (ret < 0) {
@@ -2549,7 +2549,7 @@ out_unlock:
ocfs2_inode_unlock(xb_alloc_inode, 1);
brelse(xb_alloc_bh);
out_mutex:
- mutex_unlock(&xb_alloc_inode->i_mutex);
+ inode_unlock(xb_alloc_inode);
iput(xb_alloc_inode);
out:
brelse(blk_bh);
@@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode,
}
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
if (ret < 0) {
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
mlog_errno(ret);
goto cleanup;
}
}
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
&xbs, &ctxt, ref_meta, &credits);
@@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
return ret;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -5504,7 +5504,7 @@ out_commit:
out:
ocfs2_schedule_truncate_log_flush(osb, 1);
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
diff --git a/fs/open.c b/fs/open.c
index b25b1542c530..55bdc75e2172 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
if (ret)
newattrs.ia_valid |= ret | ATTR_FORCE;
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
/* Note any delegations or leases have already been broken: */
ret = notify_change(dentry, &newattrs, NULL);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
return ret;
}
@@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode)
if (error)
return error;
retry_deleg:
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
@@ -518,7 +518,7 @@ retry_deleg:
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
error = notify_change(path->dentry, &newattrs, &delegated_inode);
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
@@ -593,11 +593,11 @@ retry_deleg:
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_path_chown(path, uid, gid);
if (!error)
error = notify_change(path->dentry, &newattrs, &delegated_inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500cda3e..b61b883c8ff8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 0a8983492d91..d894e7cd9a86 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -22,9 +22,9 @@
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
- ssize_t list_size, size;
- char *buf, *name, *value;
- int error;
+ ssize_t list_size, size, value_size = 0;
+ char *buf, *name, *value = NULL;
+ int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
@@ -41,29 +41,40 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
if (!buf)
return -ENOMEM;
- error = -ENOMEM;
- value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
- if (!value)
- goto out;
-
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
- goto out_free_value;
+ goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
- size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
- if (size <= 0) {
+retry:
+ size = vfs_getxattr(old, name, value, value_size);
+ if (size == -ERANGE)
+ size = vfs_getxattr(old, name, NULL, 0);
+
+ if (size < 0) {
error = size;
- goto out_free_value;
+ break;
+ }
+
+ if (size > value_size) {
+ void *new;
+
+ new = krealloc(value, size, GFP_KERNEL);
+ if (!new) {
+ error = -ENOMEM;
+ break;
+ }
+ value = new;
+ value_size = size;
+ goto retry;
}
+
error = vfs_setxattr(new, name, value, size, 0);
if (error)
- goto out_free_value;
+ break;
}
-
-out_free_value:
kfree(value);
out:
kfree(buf);
@@ -237,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (err)
goto out_cleanup;
- mutex_lock(&newdentry->d_inode->i_mutex);
+ inode_lock(newdentry->d_inode);
err = ovl_set_attr(newdentry, stat);
- mutex_unlock(&newdentry->d_inode->i_mutex);
+ inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 692ceda3bc21..ed95272d57a6 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
- mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
@@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
out_dput:
dput(newdentry);
out_unlock:
- mutex_unlock(&udir->i_mutex);
+ inode_unlock(udir);
return err;
}
@@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (err)
goto out_cleanup;
- mutex_lock(&opaquedir->d_inode->i_mutex);
+ inode_lock(opaquedir->d_inode);
err = ovl_set_attr(opaquedir, &stat);
- mutex_unlock(&opaquedir->d_inode->i_mutex);
+ inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
@@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
struct dentry *upper = ovl_dentry_upper(dentry);
int err;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
err = -ESTALE;
if (upper->d_parent == upperdir) {
/* Don't let d_delete() think it can reset d_inode */
@@ -619,7 +619,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* now.
*/
d_drop(dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return err;
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 964a60fa7afc..49e204560655 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -42,6 +42,19 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
int err;
struct dentry *upperdentry;
+ /*
+ * Check for permissions before trying to copy-up. This is redundant
+ * since it will be rechecked later by ->setattr() on upper dentry. But
+ * without this, copy-up can be triggered by just about anybody.
+ *
+ * We don't initialize inode->size, which just means that
+ * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
+ * check for a swapfile (which this won't be anyway).
+ */
+ err = inode_change_ok(dentry->d_inode, attr);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
goto out;
@@ -50,9 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
- mutex_lock(&upperdentry->d_inode->i_mutex);
+ inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
- mutex_unlock(&upperdentry->d_inode->i_mutex);
+ inode_unlock(upperdentry->d_inode);
}
ovl_drop_write(dentry);
out:
@@ -95,6 +108,29 @@ int ovl_permission(struct inode *inode, int mask)
realdentry = ovl_entry_real(oe, &is_upper);
+ if (ovl_is_default_permissions(inode)) {
+ struct kstat stat;
+ struct path realpath = { .dentry = realdentry };
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
+
+ err = vfs_getattr(&realpath, &stat);
+ if (err)
+ return err;
+
+ if ((stat.mode ^ inode->i_mode) & S_IFMT)
+ return -ESTALE;
+
+ inode->i_mode = stat.mode;
+ inode->i_uid = stat.uid;
+ inode->i_gid = stat.gid;
+
+ return generic_permission(inode, mask);
+ }
+
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e17154aeaae4..99b4168c36ff 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -142,7 +142,10 @@ struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
+ bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 70e9af551600..fdaf28f75e12 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
dput(dentry);
}
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
put_cred(override_cred);
@@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
loff_t res;
struct ovl_dir_file *od = file->private_data;
- mutex_lock(&file_inode(file)->i_mutex);
+ inode_lock(file_inode(file));
if (!file->f_pos)
ovl_dir_reset(file);
@@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
res = offset;
}
out_unlock:
- mutex_unlock(&file_inode(file)->i_mutex);
+ inode_unlock(file_inode(file));
return res;
}
@@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
@@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
fput(realfile);
realfile = od->upperfile;
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
}
@@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ovl_cache_put(od, file->f_path.dentry);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
fput(od->realfile);
if (od->upperfile)
@@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
- mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
@@ -571,8 +571,9 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
(int) PTR_ERR(dentry));
continue;
}
- ovl_cleanup(upper->d_inode, dentry);
+ if (dentry->d_inode)
+ ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
- mutex_unlock(&upper->d_inode->i_mutex);
+ inode_unlock(upper->d_inode);
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e38ee0fed24a..8d826bd56b26 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -9,12 +9,14 @@
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/pagemap.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/module.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/statfs.h>
#include <linux/seq_file.h>
@@ -24,12 +26,11 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Overlay filesystem");
MODULE_LICENSE("GPL");
-#define OVERLAYFS_SUPER_MAGIC 0x794c7630
-
struct ovl_config {
char *lowerdir;
char *upperdir;
char *workdir;
+ bool default_permissions;
};
/* private information held for overlayfs's superblock */
@@ -154,6 +155,18 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
return realdentry;
}
+struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
+ bool is_upper)
+{
+ if (is_upper) {
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ return ofs->upper_mnt;
+ } else {
+ return oe->numlower ? oe->lowerstack[0].mnt : NULL;
+ }
+}
+
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -161,6 +174,13 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
return oe->cache;
}
+bool ovl_is_default_permissions(struct inode *inode)
+{
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ return ofs->config.default_permissions;
+}
+
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -209,7 +229,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+ WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
WARN_ON(oe->__upperdentry);
BUG_ON(!upperdentry->d_inode);
/*
@@ -224,7 +244,7 @@ void ovl_dentry_version_inc(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ WARN_ON(!inode_is_locked(dentry->d_inode));
oe->version++;
}
@@ -232,7 +252,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ WARN_ON(!inode_is_locked(dentry->d_inode));
return oe->version;
}
@@ -355,9 +375,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
{
struct dentry *dentry;
- mutex_lock(&dir->d_inode->i_mutex);
+ inode_lock(dir->d_inode);
dentry = lookup_one_len(name->name, dir, name->len);
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
if (IS_ERR(dentry)) {
if (PTR_ERR(dentry) == -ENOENT)
@@ -594,6 +614,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_show_option(m, "upperdir", ufs->config.upperdir);
seq_show_option(m, "workdir", ufs->config.workdir);
}
+ if (ufs->config.default_permissions)
+ seq_puts(m, ",default_permissions");
return 0;
}
@@ -618,6 +640,7 @@ enum {
OPT_LOWERDIR,
OPT_UPPERDIR,
OPT_WORKDIR,
+ OPT_DEFAULT_PERMISSIONS,
OPT_ERR,
};
@@ -625,6 +648,7 @@ static const match_table_t ovl_tokens = {
{OPT_LOWERDIR, "lowerdir=%s"},
{OPT_UPPERDIR, "upperdir=%s"},
{OPT_WORKDIR, "workdir=%s"},
+ {OPT_DEFAULT_PERMISSIONS, "default_permissions"},
{OPT_ERR, NULL}
};
@@ -685,6 +709,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
return -ENOMEM;
break;
+ case OPT_DEFAULT_PERMISSIONS:
+ config->default_permissions = true;
+ break;
+
default:
pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
return -EINVAL;
@@ -716,7 +744,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
if (err)
return ERR_PTR(err);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
strlen(OVL_WORKDIR_NAME));
@@ -742,7 +770,7 @@ retry:
goto out_dput;
}
out_unlock:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
mnt_drop_write(mnt);
return work;
@@ -910,6 +938,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_stack_depth = 0;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
if (ufs->config.upperdir) {
if (!ufs->config.workdir) {
pr_err("overlayfs: missing 'workdir'\n");
@@ -1053,6 +1082,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
root_dentry->d_fsdata = oe;
+ ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode,
+ root_dentry->d_inode);
+
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
sb->s_root = root_dentry;
diff --git a/fs/pipe.c b/fs/pipe.c
index 42cf8ddf0e55..ab8dad3ccb6a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576;
*/
unsigned int pipe_min_size = PAGE_SIZE;
+/* Maximum allocatable pages per user. Hard limit is unset by default, soft
+ * matches default values.
+ */
+unsigned long pipe_user_pages_hard;
+unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+
/*
* We use a start+len construction, which provides full use of the
* allocated memory.
@@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
+static void account_pipe_buffers(struct pipe_inode_info *pipe,
+ unsigned long old, unsigned long new)
+{
+ atomic_long_add(new - old, &pipe->user->pipe_bufs);
+}
+
+static bool too_many_pipe_buffers_soft(struct user_struct *user)
+{
+ return pipe_user_pages_soft &&
+ atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+}
+
+static bool too_many_pipe_buffers_hard(struct user_struct *user)
+{
+ return pipe_user_pages_hard &&
+ atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+}
+
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
if (pipe) {
- pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+ unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+ struct user_struct *user = get_current_user();
+
+ if (!too_many_pipe_buffers_hard(user)) {
+ if (too_many_pipe_buffers_soft(user))
+ pipe_bufs = 1;
+ pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+ }
+
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = PIPE_DEF_BUFFERS;
+ pipe->buffers = pipe_bufs;
+ pipe->user = user;
+ account_pipe_buffers(pipe, 0, pipe_bufs);
mutex_init(&pipe->mutex);
return pipe;
}
+ free_uid(user);
kfree(pipe);
}
@@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
+ account_pipe_buffers(pipe, pipe->buffers, 0);
+ free_uid(pipe->user);
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
@@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}
+ account_pipe_buffers(pipe, pipe->buffers, nr_pages);
pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
@@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
ret = -EPERM;
goto out;
+ } else if ((too_many_pipe_buffers_hard(pipe->user) ||
+ too_many_pipe_buffers_soft(pipe->user)) &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
}
ret = pipe_set_size(pipe, nr_pages);
break;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d73291f5f0fc..b6c00ce0e29e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2cf5d7e37375..4f764c2ac1a5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+ struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
do {
@@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
+ if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
+ && !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
seq_putc(m, '0');
@@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task)
int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
if (err)
return err;
- if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
mutex_unlock(&task->signal->cred_guard_mutex);
return -EPERM;
}
@@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
put_task_struct(task);
}
return allowed;
@@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
return true;
if (in_group_p(pid->pid_gid))
return true;
- return ptrace_may_access(task, PTRACE_MODE_READ);
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
@@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
struct mm_struct *mm = ERR_PTR(-ESRCH);
if (task) {
- mm = mm_access(task, mode);
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
put_task_struct(task);
if (!IS_ERR_OR_NULL(mm)) {
@@ -952,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
unsigned long src = *ppos;
int ret = 0;
struct mm_struct *mm = file->private_data;
+ unsigned long env_start, env_end;
if (!mm)
return 0;
@@ -963,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf,
ret = 0;
if (!atomic_inc_not_zero(&mm->mm_users))
goto free;
+
+ down_read(&mm->mmap_sem);
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
while (count > 0) {
size_t this_len, max_len;
int retval;
- if (src >= (mm->env_end - mm->env_start))
+ if (src >= (env_end - env_start))
break;
- this_len = mm->env_end - (mm->env_start + src);
+ this_len = env_end - (env_start + src);
max_len = min_t(size_t, PAGE_SIZE, count);
this_len = min(max_len, this_len);
- retval = access_remote_vm(mm, (mm->env_start + src),
+ retval = access_remote_vm(mm, (env_start + src),
page, this_len, 0);
if (retval <= 0) {
@@ -1860,7 +1868,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (!task)
goto out_notask;
- mm = mm_access(task, PTRACE_MODE_READ);
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm))
goto out;
@@ -2013,7 +2021,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
goto out;
result = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
result = -ENOENT;
@@ -2066,7 +2074,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
goto out;
ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
ret = 0;
@@ -2533,7 +2541,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
if (result)
return result;
- if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
result = -EACCES;
goto out_unlock;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d0e9b9b6223e..42305ddcbaa0 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_PANIC),
init_once);
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 92e6726f6e37..a939f5ed7f89 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i_size_write(inode, proc_root_kcore->size);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 9155a5a0d3b9..df4661abadc4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
/*
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
- *
- * Free memory cannot be taken below the low watermark, before the
- * system starts swapping.
*/
- available = i.freeram - wmark_low;
+ available = i.freeram - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 1dece8781f91..276f12431dbf 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -46,7 +46,7 @@ static const char *proc_ns_get_link(struct dentry *dentry,
if (!task)
return error;
- if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
error = ns_get_path(&ns_path, task, ns_ops);
if (!error)
nd_jump_link(&ns_path);
@@ -67,7 +67,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!task)
return res;
- if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
res = readlink_copy(buffer, buflen, name);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..b2855eea5405 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
+ * simple test in page_mapcount() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (!PageSlab(page) && page_mapcount(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 67e8db442cf0..b6a8d3529fea 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s)
struct pid_namespace *ns = s->s_fs_info;
struct dentry *self;
- mutex_lock(&root_inode->i_mutex);
+ inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
struct inode *inode = new_inode_pseudo(s);
@@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s)
} else {
self = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&root_inode->i_mutex);
+ inode_unlock(root_inode);
if (IS_ERR(self)) {
pr_err("proc_fill_super: can't allocate /proc/self\n");
return PTR_ERR(self);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187b3b5f242e..85d16c67c33e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -22,9 +23,13 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap, ptes, pmds;
+ unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ anon = get_mm_counter(mm, MM_ANONPAGES);
+ file = get_mm_counter(mm, MM_FILEPAGES);
+ shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
@@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
- hiwater_rss = total_rss = get_mm_rss(mm);
+ hiwater_rss = total_rss = anon + file + shmem;
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
- data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
+ "RssAnon:\t%8lu kB\n"
+ "RssFile:\t%8lu kB\n"
+ "RssShmem:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
@@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
- data << (PAGE_SHIFT-10),
+ anon << (PAGE_SHIFT-10),
+ file << (PAGE_SHIFT-10),
+ shmem << (PAGE_SHIFT-10),
+ mm->data_vm << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
ptes >> 10,
pmds >> 10,
@@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES);
+ *shared = get_mm_counter(mm, MM_FILEPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
- *data = mm->total_vm - mm->shared_vm;
+ *data = mm->data_vm + mm->stack_vm;
*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -451,12 +462,14 @@ struct mem_size_stats {
unsigned long private_hugetlb;
u64 pss;
u64 swap_pss;
+ bool check_shmem_swap;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? 1 << compound_order(page) : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
@@ -465,26 +478,53 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page mapped with PTE it would elevate
+ * page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
+ }
+
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
}
}
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+
+ mss->swap += shmem_partial_swap_usage(
+ walk->vma->vm_file->f_mapping, addr, end);
+
+ return 0;
+}
+#endif
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -512,11 +552,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+ && pte_none(*pte))) {
+ page = find_get_entry(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
+ if (!page)
+ return;
+
+ if (radix_tree_exceptional_entry(page))
+ mss->swap += PAGE_SIZE;
+ else
+ page_cache_release(page);
+
+ return;
}
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -532,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -549,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
@@ -671,6 +725,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
};
memset(&mss, 0, sizeof mss);
+
+#ifdef CONFIG_SHMEM
+ if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+ /*
+ * For shared or readonly shmem mappings we know that all
+ * swapped out pages belong to the shmem object, and we can
+ * obtain the swap value much more efficiently. For private
+ * writable mappings, we might have COW pages that are
+ * not affected by the parent swapped out pages of the shmem
+ * object, so we have to distinguish them during the page walk.
+ * Unless we know that the shmem object (or the part mapped by
+ * our VMA) has no swapped out pages at all.
+ */
+ unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+ if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE)) {
+ mss.swap = shmem_swapped;
+ } else {
+ mss.check_shmem_swap = true;
+ smaps_walk.pte_hole = smaps_pte_hole;
+ }
+ }
+#endif
+
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
@@ -817,9 +896,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
-
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
#else
@@ -838,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
@@ -1112,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmdp, vma);
+ if (ptl) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
@@ -1444,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 9eacd59e0360..e58a31e8fb2a 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s)
struct pid_namespace *ns = s->s_fs_info;
struct dentry *thread_self;
- mutex_lock(&root_inode->i_mutex);
+ inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
if (thread_self) {
struct inode *inode = new_inode_pseudo(s);
@@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s)
} else {
thread_self = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&root_inode->i_mutex);
+ inode_unlock(root_inode);
if (IS_ERR(thread_self)) {
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
return PTR_ERR(thread_self);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d8c439d813ce..dc645b66cd79 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
break;
}
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = d_alloc_name(root, name);
if (!dentry)
@@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
list_add(&private->list, &allpstore);
spin_unlock_irqrestore(&allpstore_lock, flags);
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
return 0;
fail_lockedalloc:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
kfree(private);
fail_alloc:
iput(inode);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index f37b3deb01b4..3a67cfb142d8 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -365,7 +365,7 @@ static int init_inodecache(void)
qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
sizeof(struct qnx4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (qnx4_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 9728b5499e1d..47bb1de07155 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -625,7 +625,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ef0d64b2a6d9..3c3b81bb6dfe 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type)
continue;
if (!sb_has_quota_active(sb, cnt))
continue;
- mutex_lock(&dqopt->files[cnt]->i_mutex);
+ inode_lock(dqopt->files[cnt]);
truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
- mutex_unlock(&dqopt->files[cnt]->i_mutex);
+ inode_unlock(dqopt->files[cnt]);
}
mutex_unlock(&dqopt->dqonoff_mutex);
@@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
/* If quota was reenabled in the meantime, we have
* nothing to do */
if (!sb_has_quota_loaded(sb, cnt)) {
- mutex_lock(&toputinode[cnt]->i_mutex);
+ inode_lock(toputinode[cnt]);
toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
S_NOATIME | S_NOQUOTA);
truncate_inode_pages(&toputinode[cnt]->i_data,
0);
- mutex_unlock(&toputinode[cnt]->i_mutex);
+ inode_unlock(toputinode[cnt]);
mark_inode_dirty_sync(toputinode[cnt]);
}
mutex_unlock(&dqopt->dqonoff_mutex);
@@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
/* We don't want quota and atime on quota files (deadlocks
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
S_NOQUOTA);
inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* When S_NOQUOTA is set, remove dquot references as no more
* references can be added
@@ -2305,12 +2305,12 @@ out_file_init:
iput(inode);
out_lock:
if (oldflags != -1) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Set the flags back (in the case of accidental quotaon()
* on a wrong file we don't want to mess up the flags) */
inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
inode->i_flags |= oldflags;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
mutex_unlock(&dqopt->dqonoff_mutex);
out_fmt:
@@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- mutex_lock(&d_inode(sb->s_root)->i_mutex);
+ inode_lock(d_inode(sb->s_root));
dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
- mutex_unlock(&d_inode(sb->s_root)->i_mutex);
+ inode_unlock(d_inode(sb->s_root));
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -2924,4 +2924,4 @@ static int __init dquot_init(void)
return 0;
}
-module_init(dquot_init);
+fs_initcall(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index bb2869f5dfd8..d07a2f91d858 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -1,7 +1,5 @@
-
#include <linux/cred.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/quotaops.h>
#include <linux/sched.h>
@@ -105,5 +103,4 @@ static int __init quota_init(void)
"VFS: Failed to create quota netlink interface.\n");
return 0;
};
-
-module_init(quota_init);
+fs_initcall(quota_init);
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 2aa012a68e90..ed85d4f35c04 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
static int v2r1_is_id(void *dp, struct dquot *dquot);
-static struct qtree_fmt_operations v2r0_qtree_ops = {
+static const struct qtree_fmt_operations v2r0_qtree_ops = {
.mem2disk_dqblk = v2r0_mem2diskdqb,
.disk2mem_dqblk = v2r0_disk2memdqb,
.is_id = v2r0_is_id,
};
-static struct qtree_fmt_operations v2r1_qtree_ops = {
+static const struct qtree_fmt_operations v2r1_qtree_ops = {
.mem2disk_dqblk = v2r1_mem2diskdqb,
.disk2mem_dqblk = v2r1_disk2memdqb,
.is_id = v2r1_is_id,
diff --git a/fs/read_write.c b/fs/read_write.c
index 06b07d5a08fe..324ec271cc4e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -238,7 +238,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
loff_t retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_END:
offset += i_size_read(inode);
@@ -283,7 +283,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
retval = offset;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
EXPORT_SYMBOL(default_llseek);
@@ -1656,6 +1656,9 @@ next_file:
mnt_drop_write_file(dst_file);
next_loop:
fdput(dst_fd);
+
+ if (fatal_signal_pending(current))
+ goto out;
}
out:
diff --git a/fs/readdir.c b/fs/readdir.c
index ced679179cac..e69ef3b79787 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
fsnotify_access(file);
file_accessed(file);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
return res;
}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 4a024e2ceb9f..3abd4004184b 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
reiserfs_write_lock(inode->i_sb);
err = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (err < 0)
return err;
return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 96a1bcf33db4..9424a4ba93a9 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
BUG_ON(!S_ISREG(inode->i_mode));
err = sync_mapping_buffers(inode->i_mapping);
reiserfs_write_lock(inode->i_sb);
@@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (barrier_done < 0)
return barrier_done;
return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 6ec8a30a0911..036a1fc0a8c3 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -224,7 +224,7 @@ out_unlock:
page_cache_release(page);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
reiserfs_write_unlock(inode->i_sb);
return retval;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4a62fe8cc3bf..c0306ec8ed7b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)
pathrelse(&path);
inode = reiserfs_iget(s, &obj_key);
- if (!inode) {
+ if (IS_ERR_OR_NULL(inode)) {
/*
* the unlink almost completed, it just did not
* manage to remove "save" link and release objectid
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e5ddb4e5ea94..57e0b2310532 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -64,14 +64,14 @@
#ifdef CONFIG_REISERFS_FS_XATTR
static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
{
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
return dir->i_op->create(dir, dentry, mode, true);
}
#endif
static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
return dir->i_op->mkdir(dir, dentry, mode);
}
@@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
error = dir->i_op->unlink(dir, dentry);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
if (!error)
d_delete(dentry);
@@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
error = dir->i_op->rmdir(dir, dentry);
if (!error)
d_inode(dentry)->i_flags |= S_DEAD;
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
if (!error)
d_delete(dentry);
@@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
if (d_really_is_negative(privroot))
return ERR_PTR(-ENODATA);
- mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
@@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
}
}
- mutex_unlock(&d_inode(privroot)->i_mutex);
+ inode_unlock(d_inode(privroot));
return xaroot;
}
@@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
le32_to_cpu(INODE_PKEY(inode)->k_objectid),
inode->i_generation);
- mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
@@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
}
}
- mutex_unlock(&d_inode(xaroot)->i_mutex);
+ inode_unlock(d_inode(xaroot));
dput(xaroot);
return xadir;
}
@@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
container_of(ctx, struct reiserfs_dentry_buf, ctx);
struct dentry *dentry;
- WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
return -ENOSPC;
@@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
goto out_dir;
}
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
buf.xadir = dir;
while (1) {
@@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
break;
buf.count = 0;
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
cleanup_dentry_buf(&buf);
@@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
if (!err) {
int jerror;
- mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex,
+ inode_lock_nested(d_inode(dir->d_parent),
I_MUTEX_XATTR);
err = action(dir, data);
reiserfs_write_lock(inode->i_sb);
jerror = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&d_inode(dir->d_parent)->i_mutex);
+ inode_unlock(d_inode(dir->d_parent));
err = jerror ?: err;
}
}
@@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
if (IS_ERR(xadir))
return ERR_CAST(xadir);
- mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
xafile = lookup_one_len(name, xadir, strlen(name));
if (IS_ERR(xafile)) {
err = PTR_ERR(xafile);
@@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
if (err)
dput(xafile);
out:
- mutex_unlock(&d_inode(xadir)->i_mutex);
+ inode_unlock(d_inode(xadir));
dput(xadir);
if (err)
return ERR_PTR(err);
@@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
if (IS_ERR(xadir))
return PTR_ERR(xadir);
- mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
dentry = lookup_one_len(name, xadir, strlen(name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
dput(dentry);
out_dput:
- mutex_unlock(&d_inode(xadir)->i_mutex);
+ inode_unlock(d_inode(xadir));
dput(xadir);
return err;
}
@@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
.ia_valid = ATTR_SIZE | ATTR_CTIME,
};
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
inode_dio_wait(d_inode(dentry));
err = reiserfs_setattr(dentry, &newattrs);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
} else
update_ctime(inode);
out_unlock:
@@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
goto out;
}
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
if (!err)
err = buf.pos;
@@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry)
int err;
struct inode *inode = d_inode(dentry->d_parent);
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(inode));
err = xattr_mkdir(inode, dentry, 0700);
if (err || d_really_is_negative(dentry)) {
@@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
int err = 0;
/* If we don't have the privroot located yet - go find it */
- mutex_lock(&d_inode(s->s_root)->i_mutex);
+ inode_lock(d_inode(s->s_root));
dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
strlen(PRIVROOT_NAME));
if (!IS_ERR(dentry)) {
@@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
d_inode(dentry)->i_flags |= S_PRIVATE;
} else
err = PTR_ERR(dentry);
- mutex_unlock(&d_inode(s->s_root)->i_mutex);
+ inode_unlock(d_inode(s->s_root));
return err;
}
@@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
goto error;
if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
- mutex_lock(&d_inode(s->s_root)->i_mutex);
+ inode_lock(d_inode(s->s_root));
err = create_privroot(REISERFS_SB(s)->priv_root);
- mutex_unlock(&d_inode(s->s_root)->i_mutex);
+ inode_unlock(d_inode(s->s_root));
}
if (d_really_is_positive(privroot)) {
s->s_xattr = reiserfs_xattr_handlers;
- mutex_lock(&d_inode(privroot)->i_mutex);
+ inode_lock(d_inode(privroot));
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
@@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
else
err = PTR_ERR(dentry);
}
- mutex_unlock(&d_inode(privroot)->i_mutex);
+ inode_unlock(d_inode(privroot));
}
error:
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index bb894e78a821..6b00ca357c58 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -619,8 +619,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index dded920cbc8f..5e79bfa4f260 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -419,7 +419,8 @@ static int __init init_inodecache(void)
{
squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
sizeof(struct squashfs_inode_info), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
return squashfs_inode_cachep ? 0 : -ENOMEM;
}
diff --git a/fs/stat.c b/fs/stat.c
index d4a61d8dc021..bc045c7994e1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
#ifndef INIT_STRUCT_STAT_PADDING
diff --git a/fs/super.c b/fs/super.c
index cc658a20a29e..1182af8fd5ff 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1197,7 +1197,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
else
ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
- WARN_ON(force_trylock & !ret);
+ WARN_ON(force_trylock && !ret);
return ret;
}
EXPORT_SYMBOL(__sb_start_write);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 07ac18c355e7..d62c423a5a2d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -346,7 +346,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index c66f2423e1f5..4a0e48f92104 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo
* the files within the tracefs system. It is up to the individual
* mkdir routine to handle races.
*/
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = tracefs_ops.mkdir(name);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
kfree(name);
@@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
* This time we need to unlock not only the parent (inode) but
* also the directory that is being deleted.
*/
- mutex_unlock(&inode->i_mutex);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(inode);
+ inode_unlock(dentry->d_inode);
ret = tracefs_ops.rmdir(name);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
+ inode_lock(dentry->d_inode);
kfree(name);
@@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = tracefs_mount->mnt_root;
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry) && dentry->d_inode) {
dput(dentry);
@@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
}
if (IS_ERR(dentry)) {
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
@@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
static struct dentry *failed_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ inode_unlock(dentry->d_parent->d_inode);
dput(dentry);
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
return NULL;
@@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ inode_unlock(dentry->d_parent->d_inode);
return dentry;
}
@@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry)
if (!parent || !parent->d_inode)
return;
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
ret = __tracefs_remove(dentry, parent);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
if (!ret)
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
@@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
parent = dentry;
down:
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
loop:
/*
* The parent->d_subdirs is protected by the d_lock. Outside that
@@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
/* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
spin_unlock(&parent->d_lock);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
parent = child;
goto down;
}
@@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry)
}
spin_unlock(&parent->d_lock);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
child = parent;
parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
if (child != dentry)
/* go up */
@@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
if (!__tracefs_remove(child, parent))
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
}
/**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e49bd2808bf3..795992a8321e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
- ubifs_assert(mutex_is_locked(&dir->i_mutex));
- ubifs_assert(mutex_is_locked(&inode->i_mutex));
+ ubifs_assert(inode_is_locked(dir));
+ ubifs_assert(inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
@@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
- ubifs_assert(mutex_is_locked(&dir->i_mutex));
- ubifs_assert(mutex_is_locked(&inode->i_mutex));
+ ubifs_assert(inode_is_locked(dir));
+ ubifs_assert(inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
return err;
@@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
inode->i_ino, dir->i_ino);
- ubifs_assert(mutex_is_locked(&dir->i_mutex));
- ubifs_assert(mutex_is_locked(&inode->i_mutex));
+ ubifs_assert(inode_is_locked(dir));
+ ubifs_assert(inode_is_locked(inode));
err = check_dir_empty(c, d_inode(dentry));
if (err)
return err;
@@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
old_dentry, old_inode->i_ino, old_dir->i_ino,
new_dentry, new_dir->i_ino);
- ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
- ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+ ubifs_assert(inode_is_locked(old_dir));
+ ubifs_assert(inode_is_locked(new_dir));
if (unlink)
- ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+ ubifs_assert(inode_is_locked(new_inode));
if (unlink && is_dir) {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index eff62801acbf..065c88f8e4b8 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
err = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Synchronize the inode unless this is a 'datasync()' call. */
if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
@@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
err = ubifs_sync_wbufs_by_inode(c, inode);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fd90c079537..a233ba913be4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2248,8 +2248,8 @@ static int __init ubifs_init(void)
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
- SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
- &inode_slab_ctor);
+ SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT, &inode_slab_ctor);
if (!ubifs_inode_slab)
return -ENOMEM;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e53292d0c21b..c7f4d434d098 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value,
union ubifs_key key;
int err, type;
- ubifs_assert(mutex_is_locked(&host->i_mutex));
+ ubifs_assert(inode_is_locked(host));
if (size > UBIFS_MAX_INO_DATA)
return -ERANGE;
@@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
dbg_gen("xattr '%s', ino %lu ('%pd')", name,
host->i_ino, dentry);
- ubifs_assert(mutex_is_locked(&host->i_mutex));
+ ubifs_assert(inode_is_locked(host));
err = check_namespace(&nm);
if (err < 0)
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 6d6a96b4e73f..e0fd65fe73e8 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb,
*/
int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
- struct allocExtDesc *aed;
eloc.logicalBlockNum = start;
elen = EXT_RECORDED_ALLOCATED |
@@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb,
}
if (epos.offset + (2 * adsize) > sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- int loffset;
-
- brelse(oepos.bh);
- oepos = epos;
-
/* Steal a block from the extent being free'd */
- epos.block.logicalBlockNum = eloc.logicalBlockNum;
+ udf_setup_indirect_aext(table, eloc.logicalBlockNum,
+ &epos);
+
eloc.logicalBlockNum++;
elen -= sb->s_blocksize;
-
- epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, &epos.block, 0));
- if (!epos.bh) {
- brelse(oepos.bh);
- goto error_return;
- }
- aed = (struct allocExtDesc *)(epos.bh->b_data);
- aed->previousAllocExtLocation =
- cpu_to_le32(oepos.block.logicalBlockNum);
- if (epos.offset + adsize > sb->s_blocksize) {
- loffset = epos.offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = iinfo->i_ext.i_data + epos.offset
- - adsize;
- dptr = epos.bh->b_data +
- sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos.offset = sizeof(struct allocExtDesc) +
- adsize;
- } else {
- loffset = epos.offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- if (oepos.bh) {
- sptr = oepos.bh->b_data + epos.offset;
- aed = (struct allocExtDesc *)
- oepos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs,
- adsize);
- } else {
- sptr = iinfo->i_ext.i_data +
- epos.offset;
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- }
- epos.offset = sizeof(struct allocExtDesc);
- }
- if (sbi->s_udfrev >= 0x0200)
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 3, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
- else
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 2, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
-
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos.block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- lad->extLocation =
- cpu_to_lelb(epos.block);
- break;
- }
- if (oepos.bh) {
- udf_update_tag(oepos.bh->b_data, loffset);
- mark_buffer_dirty(oepos.bh);
- } else {
- mark_inode_dirty(table);
- }
}
/* It's possible that stealing the block emptied the extent */
- if (elen) {
- udf_write_aext(table, &epos, &eloc, elen, 1);
-
- if (!epos.bh) {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- } else {
- aed = (struct allocExtDesc *)epos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- udf_update_tag(epos.bh->b_data, epos.offset);
- mark_buffer_dirty(epos.bh);
- }
- }
+ if (elen)
+ __udf_add_aext(table, &epos, &eloc, elen, 1);
}
brelse(epos.bh);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bddf3d071dae..1af98963d860 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct udf_inode_info *iinfo = UDF_I(inode);
int err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = generic_write_checks(iocb, from);
if (retval <= 0)
@@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
(udf_file_entry_alloc_offset(inode) + end)) {
err = udf_expand_file_adinicb(inode);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
udf_debug("udf_expand_adinicb: err=%d\n", err);
return err;
}
@@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
retval = __generic_file_write_iter(iocb, from);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval > 0) {
mark_inode_dirty(inode);
@@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp)
* Grab i_mutex to avoid races with writes changing i_size
* while we are running.
*/
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
down_write(&UDF_I(inode)->i_data_sem);
udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
up_write(&UDF_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 055746350d16..166d3ed32c39 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode)
.nr_to_write = 1,
};
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(inode));
if (!iinfo->i_lenAlloc) {
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
@@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode,
udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
count++;
- } else
+ } else {
+ struct kernel_lb_addr tmploc;
+ uint32_t tmplen;
+
udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
+ /*
+ * We've rewritten the last extent but there may be an empty
+ * indirect extent after it - enter it.
+ */
+ udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+ }
/* Managed to do everything necessary? */
if (!blocks)
@@ -1867,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
return inode;
}
-int udf_add_aext(struct inode *inode, struct extent_position *epos,
- struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos)
{
- int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh;
struct allocExtDesc *aed;
- uint8_t *ptr;
- struct udf_inode_info *iinfo = UDF_I(inode);
+ struct extent_position nepos;
+ struct kernel_lb_addr neloc;
+ int ver, adsize;
- if (!epos->bh)
- ptr = iinfo->i_ext.i_data + epos->offset -
- udf_file_entry_alloc_offset(inode) +
- iinfo->i_lenEAttr;
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
else
- ptr = epos->bh->b_data + epos->offset;
+ return -EIO;
+
+ neloc.logicalBlockNum = block;
+ neloc.partitionReferenceNum = epos->block.partitionReferenceNum;
+
+ bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0));
+ if (!bh)
+ return -EIO;
+ lock_buffer(bh);
+ memset(bh->b_data, 0x00, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ mark_buffer_dirty_inode(bh, inode);
+
+ aed = (struct allocExtDesc *)(bh->b_data);
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) {
+ aed->previousAllocExtLocation =
+ cpu_to_le32(epos->block.logicalBlockNum);
+ }
+ aed->lengthAllocDescs = cpu_to_le32(0);
+ if (UDF_SB(sb)->s_udfrev >= 0x0200)
+ ver = 3;
+ else
+ ver = 2;
+ udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block,
+ sizeof(struct tag));
+
+ nepos.block = neloc;
+ nepos.offset = sizeof(struct allocExtDesc);
+ nepos.bh = bh;
+
+ /*
+ * Do we have to copy current last extent to make space for indirect
+ * one?
+ */
+ if (epos->offset + adsize > sb->s_blocksize) {
+ struct kernel_lb_addr cp_loc;
+ uint32_t cp_len;
+ int cp_type;
+
+ epos->offset -= adsize;
+ cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0);
+ cp_len |= ((uint32_t)cp_type) << 30;
+
+ __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1);
+ udf_write_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ } else {
+ __udf_add_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ }
+
+ brelse(epos->bh);
+ *epos = nepos;
+
+ return 0;
+}
+
+/*
+ * Append extent at the given position - should be the first free one in inode
+ * / indirect extent. This function assumes there is enough space in the inode
+ * or indirect extent. Use udf_add_aext() if you didn't check for this before.
+ */
+int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct allocExtDesc *aed;
+ int adsize;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -1891,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
else
return -EIO;
- if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- struct buffer_head *nbh;
- int err, loffset;
- struct kernel_lb_addr obloc = epos->block;
-
- epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
- obloc.partitionReferenceNum,
- obloc.logicalBlockNum, &err);
- if (!epos->block.logicalBlockNum)
- return -ENOSPC;
- nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
- &epos->block,
- 0));
- if (!nbh)
- return -EIO;
- lock_buffer(nbh);
- memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
- set_buffer_uptodate(nbh);
- unlock_buffer(nbh);
- mark_buffer_dirty_inode(nbh, inode);
-
- aed = (struct allocExtDesc *)(nbh->b_data);
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
- aed->previousAllocExtLocation =
- cpu_to_le32(obloc.logicalBlockNum);
- if (epos->offset + adsize > inode->i_sb->s_blocksize) {
- loffset = epos->offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = ptr - adsize;
- dptr = nbh->b_data + sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos->offset = sizeof(struct allocExtDesc) + adsize;
- } else {
- loffset = epos->offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- sptr = ptr;
- epos->offset = sizeof(struct allocExtDesc);
-
- if (epos->bh) {
- aed = (struct allocExtDesc *)epos->bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- } else {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(inode);
- }
- }
- if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- else
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos->block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- lad->extLocation = cpu_to_lelb(epos->block);
- memset(lad->impUse, 0x00, sizeof(lad->impUse));
- break;
- }
- if (epos->bh) {
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos->bh->b_data, loffset);
- else
- udf_update_tag(epos->bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos->bh, inode);
- brelse(epos->bh);
- } else {
- mark_inode_dirty(inode);
- }
- epos->bh = nbh;
+ if (!epos->bh) {
+ WARN_ON(iinfo->i_lenAlloc !=
+ epos->offset - udf_file_entry_alloc_offset(inode));
+ } else {
+ aed = (struct allocExtDesc *)epos->bh->b_data;
+ WARN_ON(le32_to_cpu(aed->lengthAllocDescs) !=
+ epos->offset - sizeof(struct allocExtDesc));
+ WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize);
}
udf_write_aext(inode, epos, eloc, elen, inc);
@@ -1996,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
return 0;
}
+/*
+ * Append extent at given position - should be the first free one in inode
+ * / indirect extent. Takes care of allocating and linking indirect blocks.
+ */
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ int adsize;
+ struct super_block *sb = inode->i_sb;
+
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
+ else
+ return -EIO;
+
+ if (epos->offset + (2 * adsize) > sb->s_blocksize) {
+ int err;
+ int new_block;
+
+ new_block = udf_new_block(sb, NULL,
+ epos->block.partitionReferenceNum,
+ epos->block.logicalBlockNum, &err);
+ if (!new_block)
+ return -ENOSPC;
+
+ err = udf_setup_indirect_aext(inode, new_block, epos);
+ if (err)
+ return err;
+ }
+
+ return __udf_add_aext(inode, epos, eloc, elen, inc);
+}
+
void udf_write_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
@@ -2048,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos,
epos->offset += adsize;
}
+/*
+ * Only 1 indirect extent in a row really makes sense but allow up to 16 in case
+ * someone does some weird stuff.
+ */
+#define UDF_MAX_INDIR_EXTS 16
+
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int8_t etype;
+ unsigned int indirections = 0;
while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
(EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
int block;
+
+ if (++indirections > UDF_MAX_INDIR_EXTS) {
+ udf_err(inode->i_sb,
+ "too many indirect extents in inode %lu\n",
+ inode->i_ino);
+ return -1;
+ }
+
epos->block = *eloc;
epos->offset = sizeof(struct allocExtDesc);
brelse(epos->bh);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 81155b9b445b..a522c15a0bfd 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT),
init_once);
if (!udf_inode_cachep)
return -ENOMEM;
@@ -278,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
{
int i;
int nr_groups = bitmap->s_nr_groups;
- int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
- nr_groups);
for (i = 0; i < nr_groups; i++)
if (bitmap->s_block_bitmap[i])
brelse(bitmap->s_block_bitmap[i]);
- if (size <= PAGE_SIZE)
- kfree(bitmap);
- else
- vfree(bitmap);
+ kvfree(bitmap);
}
static void udf_free_partition(struct udf_part_map *map)
@@ -1586,6 +1582,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
}
/*
+ * Maximum number of Terminating Descriptor redirections. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_TD_NESTING 64
+
+/*
* Process a main/reserve volume descriptor sequence.
* @block First block of first extent of the sequence.
* @lastblock Lastblock of first extent of the sequence.
@@ -1609,6 +1612,7 @@ static noinline int udf_process_sequence(
uint16_t ident;
long next_s = 0, next_e = 0;
int ret;
+ unsigned int indirections = 0;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
@@ -1679,6 +1683,12 @@ static noinline int udf_process_sequence(
}
break;
case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
+ if (++indirections > UDF_MAX_TD_NESTING) {
+ udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING);
+ brelse(bh);
+ return -EIO;
+ }
+
vds[VDS_POS_TERMINATING_DESC].block = block;
if (next_e) {
block = next_s;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index ce169b49429d..fa0044b6b81d 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -158,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
struct kernel_lb_addr *, uint32_t *, sector_t *);
+extern int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos);
+extern int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc);
extern int udf_add_aext(struct inode *, struct extent_position *,
struct kernel_lb_addr *, uint32_t, int);
extern void udf_write_aext(struct inode *, struct extent_position *,
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index ab478e62baae..e788a05aab83 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
if (c < 0x80U)
utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
else if (c < 0x800U) {
+ if (utf_o->u_len > (UDF_NAME_LEN - 4))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xc0 | (c >> 6));
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0x80 | (c & 0x3f));
} else {
+ if (utf_o->u_len > (UDF_NAME_LEN - 5))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xe0 | (c >> 12));
utf_o->u_name[utf_o->u_len++] =
@@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
{
unsigned c, i, max_val, utf_char;
- int utf_cnt, u_len;
+ int utf_cnt, u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
utf_char = 0U;
utf_cnt = 0U;
for (i = 0U; i < utf->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
+
c = (uint8_t)utf->u_name[i];
/* Complete a multi-byte UTF-8 character */
@@ -225,6 +234,7 @@ try_again:
if (max_val == 0xffU) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
goto error_out;
@@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
c = (c << 8) | ocu[i++];
len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - utf_o->u_len);
+ UDF_NAME_LEN - 2 - utf_o->u_len);
/* Valid character? */
if (len >= 0)
utf_o->u_len += len;
@@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
int len;
unsigned i, max_val;
uint16_t uni_char;
- int u_len;
+ int u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
for (i = 0U; i < uni->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
if (!len)
continue;
@@ -316,6 +330,7 @@ try_again:
if (uni_char > max_val) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f6390eec02ca..442fd52ebffe 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1427,7 +1427,7 @@ static int __init init_inodecache(void)
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ufs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/utimes.c b/fs/utimes.c
index aa138d64560a..85c40f4f373d 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times)
}
}
retry_deleg:
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = notify_change(path->dentry, &newattrs, &delegated_inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/fs/xattr.c b/fs/xattr.c
index d5dd6c8b82a7..07d0e47f6a7f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_inode_setxattr(dentry, name, value, size, flags);
if (error)
goto out;
@@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_inode_removexattr(dentry, name);
if (error)
goto out;
@@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768fc068..d1c66e465ca5 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index e2536bb1c760..dc97eb21af07 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
/*
* Values for di_flags
- * There should be a one-to-one correspondence between these flags and the
- * XFS_XFLAG_s.
*/
#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
@@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
/*
+ * Values for di_flags2. These start by being exposed to userspace in the upper
+ * 16 bits of the XFS_XFLAG_s range.
+ */
+#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
+#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
+
+#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX)
+
+/*
* Inode number format:
* low inopblog bits - offset in block
* next agblklog bits - block number in ag
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b2b73a998d42..fffe3d01bd9f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -36,40 +36,6 @@ struct dioattr {
#endif
/*
- * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
- */
-#ifndef HAVE_FSXATTR
-struct fsxattr {
- __u32 fsx_xflags; /* xflags field value (get/set) */
- __u32 fsx_extsize; /* extsize field value (get/set)*/
- __u32 fsx_nextents; /* nextents field value (get) */
- __u32 fsx_projid; /* project identifier (get/set) */
- unsigned char fsx_pad[12];
-};
-#endif
-
-/*
- * Flags for the bs_xflags/fsx_xflags field
- * There should be a one-to-one correspondence between these flags and the
- * XFS_DIFLAG_s.
- */
-#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
-#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
-#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
-#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */
-#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
-#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */
-#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
-#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
-#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
-#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
-#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
-#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
-#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
-#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
-#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
-
-/*
* Structure for XFS_IOC_GETBMAP.
* On input, fill in bmv_offset and bmv_length of the first structure
* to indicate the area of interest in the file, and bmv_entries with
@@ -514,8 +480,8 @@ typedef struct xfs_swapext
#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
-#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr)
-#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index daed4bfb85b2..435c7de42e5f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1527,6 +1527,16 @@ xfs_wait_buftarg(
LIST_HEAD(dispose);
int loop = 0;
+ /*
+ * We need to flush the buffer workqueue to ensure that all IO
+ * completion processing is 100% done. Just waiting on buffer locks is
+ * not sufficient for async IO as the reference count held over IO is
+ * not released until after the buffer lock is dropped. Hence we need to
+ * ensure here that all reference counts have been dropped before we
+ * start walking the LRU list.
+ */
+ drain_workqueue(btp->bt_mount->m_buf_workqueue);
+
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ebe9b8290a70..52883ac3cf84 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -55,7 +55,7 @@ xfs_rw_ilock(
int type)
{
if (type & XFS_IOLOCK_EXCL)
- mutex_lock(&VFS_I(ip)->i_mutex);
+ inode_lock(VFS_I(ip));
xfs_ilock(ip, type);
}
@@ -66,7 +66,7 @@ xfs_rw_iunlock(
{
xfs_iunlock(ip, type);
if (type & XFS_IOLOCK_EXCL)
- mutex_unlock(&VFS_I(ip)->i_mutex);
+ inode_unlock(VFS_I(ip));
}
static inline void
@@ -76,7 +76,7 @@ xfs_rw_ilock_demote(
{
xfs_ilock_demote(ip, type);
if (type & XFS_IOLOCK_EXCL)
- mutex_unlock(&VFS_I(ip)->i_mutex);
+ inode_unlock(VFS_I(ip));
}
/*
@@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault(
/*
* pfn_mkwrite was originally inteneded to ensure we capture time stamp
* updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
- * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
- * barrier in place.
+ * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ * to ensure we serialise the fault barrier in place.
*/
static int
xfs_filemap_pfn_mkwrite(
@@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite(
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
+ else if (IS_DAX(inode))
+ ret = dax_pfn_mkwrite(vma, vmf);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ae3758a90ed6..ceba1a83cacc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -610,60 +610,69 @@ __xfs_iflock(
STATIC uint
_xfs_dic2xflags(
- __uint16_t di_flags)
+ __uint16_t di_flags,
+ uint64_t di_flags2,
+ bool has_attr)
{
uint flags = 0;
if (di_flags & XFS_DIFLAG_ANY) {
if (di_flags & XFS_DIFLAG_REALTIME)
- flags |= XFS_XFLAG_REALTIME;
+ flags |= FS_XFLAG_REALTIME;
if (di_flags & XFS_DIFLAG_PREALLOC)
- flags |= XFS_XFLAG_PREALLOC;
+ flags |= FS_XFLAG_PREALLOC;
if (di_flags & XFS_DIFLAG_IMMUTABLE)
- flags |= XFS_XFLAG_IMMUTABLE;
+ flags |= FS_XFLAG_IMMUTABLE;
if (di_flags & XFS_DIFLAG_APPEND)
- flags |= XFS_XFLAG_APPEND;
+ flags |= FS_XFLAG_APPEND;
if (di_flags & XFS_DIFLAG_SYNC)
- flags |= XFS_XFLAG_SYNC;
+ flags |= FS_XFLAG_SYNC;
if (di_flags & XFS_DIFLAG_NOATIME)
- flags |= XFS_XFLAG_NOATIME;
+ flags |= FS_XFLAG_NOATIME;
if (di_flags & XFS_DIFLAG_NODUMP)
- flags |= XFS_XFLAG_NODUMP;
+ flags |= FS_XFLAG_NODUMP;
if (di_flags & XFS_DIFLAG_RTINHERIT)
- flags |= XFS_XFLAG_RTINHERIT;
+ flags |= FS_XFLAG_RTINHERIT;
if (di_flags & XFS_DIFLAG_PROJINHERIT)
- flags |= XFS_XFLAG_PROJINHERIT;
+ flags |= FS_XFLAG_PROJINHERIT;
if (di_flags & XFS_DIFLAG_NOSYMLINKS)
- flags |= XFS_XFLAG_NOSYMLINKS;
+ flags |= FS_XFLAG_NOSYMLINKS;
if (di_flags & XFS_DIFLAG_EXTSIZE)
- flags |= XFS_XFLAG_EXTSIZE;
+ flags |= FS_XFLAG_EXTSIZE;
if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
- flags |= XFS_XFLAG_EXTSZINHERIT;
+ flags |= FS_XFLAG_EXTSZINHERIT;
if (di_flags & XFS_DIFLAG_NODEFRAG)
- flags |= XFS_XFLAG_NODEFRAG;
+ flags |= FS_XFLAG_NODEFRAG;
if (di_flags & XFS_DIFLAG_FILESTREAM)
- flags |= XFS_XFLAG_FILESTREAM;
+ flags |= FS_XFLAG_FILESTREAM;
}
+ if (di_flags2 & XFS_DIFLAG2_ANY) {
+ if (di_flags2 & XFS_DIFLAG2_DAX)
+ flags |= FS_XFLAG_DAX;
+ }
+
+ if (has_attr)
+ flags |= FS_XFLAG_HASATTR;
+
return flags;
}
uint
xfs_ip2xflags(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_icdinode_t *dic = &ip->i_d;
+ struct xfs_icdinode *dic = &ip->i_d;
- return _xfs_dic2xflags(dic->di_flags) |
- (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
+ return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}
uint
xfs_dic2xflags(
- xfs_dinode_t *dip)
+ struct xfs_dinode *dip)
{
- return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
- (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
+ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
+ be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
}
/*
@@ -862,7 +871,8 @@ xfs_ialloc(
case S_IFREG:
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint di_flags = 0;
+ uint64_t di_flags2 = 0;
+ uint di_flags = 0;
if (S_ISDIR(mode)) {
if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
@@ -898,7 +908,11 @@ xfs_ialloc(
di_flags |= XFS_DIFLAG_NODEFRAG;
if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+
ip->i_d.di_flags |= di_flags;
+ ip->i_d.di_flags2 |= di_flags2;
}
/* FALLTHROUGH */
case S_IFLNK:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d42738deec6d..478d04e07f95 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -859,25 +859,25 @@ xfs_merge_ioc_xflags(
unsigned int xflags = start;
if (flags & FS_IMMUTABLE_FL)
- xflags |= XFS_XFLAG_IMMUTABLE;
+ xflags |= FS_XFLAG_IMMUTABLE;
else
- xflags &= ~XFS_XFLAG_IMMUTABLE;
+ xflags &= ~FS_XFLAG_IMMUTABLE;
if (flags & FS_APPEND_FL)
- xflags |= XFS_XFLAG_APPEND;
+ xflags |= FS_XFLAG_APPEND;
else
- xflags &= ~XFS_XFLAG_APPEND;
+ xflags &= ~FS_XFLAG_APPEND;
if (flags & FS_SYNC_FL)
- xflags |= XFS_XFLAG_SYNC;
+ xflags |= FS_XFLAG_SYNC;
else
- xflags &= ~XFS_XFLAG_SYNC;
+ xflags &= ~FS_XFLAG_SYNC;
if (flags & FS_NOATIME_FL)
- xflags |= XFS_XFLAG_NOATIME;
+ xflags |= FS_XFLAG_NOATIME;
else
- xflags &= ~XFS_XFLAG_NOATIME;
+ xflags &= ~FS_XFLAG_NOATIME;
if (flags & FS_NODUMP_FL)
- xflags |= XFS_XFLAG_NODUMP;
+ xflags |= FS_XFLAG_NODUMP;
else
- xflags &= ~XFS_XFLAG_NODUMP;
+ xflags &= ~FS_XFLAG_NODUMP;
return xflags;
}
@@ -945,40 +945,51 @@ xfs_set_diflags(
unsigned int xflags)
{
unsigned int di_flags;
+ uint64_t di_flags2;
/* can't set PREALLOC this way, just preserve it */
di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
- if (xflags & XFS_XFLAG_IMMUTABLE)
+ if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
- if (xflags & XFS_XFLAG_APPEND)
+ if (xflags & FS_XFLAG_APPEND)
di_flags |= XFS_DIFLAG_APPEND;
- if (xflags & XFS_XFLAG_SYNC)
+ if (xflags & FS_XFLAG_SYNC)
di_flags |= XFS_DIFLAG_SYNC;
- if (xflags & XFS_XFLAG_NOATIME)
+ if (xflags & FS_XFLAG_NOATIME)
di_flags |= XFS_DIFLAG_NOATIME;
- if (xflags & XFS_XFLAG_NODUMP)
+ if (xflags & FS_XFLAG_NODUMP)
di_flags |= XFS_DIFLAG_NODUMP;
- if (xflags & XFS_XFLAG_NODEFRAG)
+ if (xflags & FS_XFLAG_NODEFRAG)
di_flags |= XFS_DIFLAG_NODEFRAG;
- if (xflags & XFS_XFLAG_FILESTREAM)
+ if (xflags & FS_XFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
if (S_ISDIR(ip->i_d.di_mode)) {
- if (xflags & XFS_XFLAG_RTINHERIT)
+ if (xflags & FS_XFLAG_RTINHERIT)
di_flags |= XFS_DIFLAG_RTINHERIT;
- if (xflags & XFS_XFLAG_NOSYMLINKS)
+ if (xflags & FS_XFLAG_NOSYMLINKS)
di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if (xflags & XFS_XFLAG_EXTSZINHERIT)
+ if (xflags & FS_XFLAG_EXTSZINHERIT)
di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- if (xflags & XFS_XFLAG_PROJINHERIT)
+ if (xflags & FS_XFLAG_PROJINHERIT)
di_flags |= XFS_DIFLAG_PROJINHERIT;
} else if (S_ISREG(ip->i_d.di_mode)) {
- if (xflags & XFS_XFLAG_REALTIME)
+ if (xflags & FS_XFLAG_REALTIME)
di_flags |= XFS_DIFLAG_REALTIME;
- if (xflags & XFS_XFLAG_EXTSIZE)
+ if (xflags & FS_XFLAG_EXTSIZE)
di_flags |= XFS_DIFLAG_EXTSIZE;
}
-
ip->i_d.di_flags = di_flags;
+
+ /* diflags2 only valid for v3 inodes. */
+ if (ip->i_d.di_version < 3)
+ return;
+
+ di_flags2 = 0;
+ if (xflags & FS_XFLAG_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+
+ ip->i_d.di_flags2 = di_flags2;
+
}
STATIC void
@@ -988,22 +999,27 @@ xfs_diflags_to_linux(
struct inode *inode = VFS_I(ip);
unsigned int xflags = xfs_ip2xflags(ip);
- if (xflags & XFS_XFLAG_IMMUTABLE)
+ if (xflags & FS_XFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
inode->i_flags &= ~S_IMMUTABLE;
- if (xflags & XFS_XFLAG_APPEND)
+ if (xflags & FS_XFLAG_APPEND)
inode->i_flags |= S_APPEND;
else
inode->i_flags &= ~S_APPEND;
- if (xflags & XFS_XFLAG_SYNC)
+ if (xflags & FS_XFLAG_SYNC)
inode->i_flags |= S_SYNC;
else
inode->i_flags &= ~S_SYNC;
- if (xflags & XFS_XFLAG_NOATIME)
+ if (xflags & FS_XFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
+ if (xflags & FS_XFLAG_DAX)
+ inode->i_flags |= S_DAX;
+ else
+ inode->i_flags &= ~S_DAX;
+
}
static int
@@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags(
/* Can't change realtime flag if any extents are allocated. */
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
- XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+ XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
return -EINVAL;
/* If realtime flag is set then must have realtime device */
- if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+ if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
(ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
return -EINVAL;
@@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags(
* we have appropriate permission.
*/
if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
- (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+ (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
@@ -1095,8 +1111,8 @@ out_cancel:
* extent size hint validation is somewhat cumbersome. Rules are:
*
* 1. extent size hint is only valid for directories and regular files
- * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
- * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
* 4. can only be changed on regular files if no extents are allocated
* 5. can be changed on directories at any time
* 6. extsize hint of 0 turns off hints, clears inode flags.
@@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize(
{
struct xfs_mount *mp = ip->i_mount;
- if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
return -EINVAL;
- if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
!S_ISDIR(ip->i_d.di_mode))
return -EINVAL;
@@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize(
return -EINVAL;
if (XFS_IS_REALTIME_INODE(ip) ||
- (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+ (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
} else {
size = mp->m_sb.sb_blocksize;
@@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize(
if (fa->fsx_extsize % size)
return -EINVAL;
} else
- fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
return 0;
}
@@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid(
if (xfs_get_projid(ip) != fa->fsx_projid)
return -EINVAL;
- if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+ if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
return -EINVAL;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 06eafafe636e..76b71a1c6c32 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags(
inode->i_flags |= S_SYNC;
if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- /* XXX: Also needs an on-disk per inode flag! */
- if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+ ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
inode->i_flags |= S_DAX;
}
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index dc6221942b85..ade236e90bb3 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -42,11 +42,11 @@ xfs_break_layouts(
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
if (with_imutex)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
xfs_ilock(ip, *iolock);
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b35775752b74..59c9b7bd958d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1714,8 +1714,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+ KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index aa67339b9537..4f18fd92ca13 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,7 +497,6 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
- set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)