Diffstat (limited to 'fs/xfs/xfs_iops.c')
 fs/xfs/xfs_iops.c | 167 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 118 insertions(+), 49 deletions(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a0d77f5f512e..40289fe6f5b2 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,6 +17,8 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_btree.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
@@ -25,6 +27,8 @@
#include "xfs_error.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_file.h"
+#include "xfs_bmap.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -38,7 +42,9 @@
* held. For regular files, the lock order is the other way around - the
* mmap_lock is taken during the page fault, and then we lock the ilock to do
* block mapping. Hence we need a different class for the directory ilock so
- * that lockdep can tell them apart.
+ * that lockdep can tell them apart. Directories in the metadata directory
+ * tree get a separate class so that lockdep reports will warn us if someone
+ * ever tries to lock regular directories after locking metadata directories.
*/
static struct lock_class_key xfs_nondir_ilock_class;
static struct lock_class_key xfs_dir_ilock_class;
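
For illustration: distinct struct lock_class_key objects are how lockdep is
told that two locks of the same type belong to different classes, so an
ordering learned for one class does not raise false positives against the
other. A minimal sketch of the pattern (hypothetical lock names, not part of
this patch):

#include <linux/lockdep.h>
#include <linux/rwsem.h>

static struct lock_class_key parent_class;
static struct lock_class_key child_class;

static void init_tree_locks(struct rw_semaphore *parent,
			    struct rw_semaphore *child)
{
	init_rwsem(parent);
	init_rwsem(child);
	/*
	 * Same lock type, two lockdep classes: a parent->child nesting
	 * recorded here is tracked independently of other rwsem orderings.
	 */
	lockdep_set_class(parent, &parent_class);
	lockdep_set_class(child, &child_class);
}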
@@ -62,7 +68,7 @@ xfs_initxattrs(
.value = xattr->value,
.valuelen = xattr->value_len,
};
- error = xfs_attr_change(&args);
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT);
if (error < 0)
break;
}
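
XFS_ATTRUPDATE_UPSERT requests set-or-create semantics, the same behavior
userspace gets from setxattr(2) with flags of 0, as opposed to XATTR_CREATE
and XATTR_REPLACE, which fail if the attribute already exists or is missing,
respectively. A small userspace sketch of the distinction (hypothetical path
and attribute name):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	const char value[] = "demo";

	/* flags == 0 is an upsert: create the attribute or overwrite it. */
	if (setxattr("/tmp/testfile", "user.demo", value, sizeof(value), 0))
		perror("upsert");

	/* XATTR_CREATE fails with EEXIST if the attribute is present;
	 * XATTR_REPLACE fails with ENODATA if it is not. */
	if (setxattr("/tmp/testfile", "user.demo", value, sizeof(value),
		     XATTR_CREATE))
		perror("create");
	return 0;
}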
@@ -169,38 +175,55 @@ xfs_generic_create(
dev_t rdev,
struct file *tmpfile) /* unnamed file */
{
- struct inode *inode;
- struct xfs_inode *ip = NULL;
- struct posix_acl *default_acl, *acl;
- struct xfs_name name;
- int error;
+ struct xfs_icreate_args args = {
+ .idmap = idmap,
+ .pip = XFS_I(dir),
+ .rdev = rdev,
+ .mode = mode,
+ };
+ struct inode *inode;
+ struct xfs_inode *ip = NULL;
+ struct posix_acl *default_acl, *acl;
+ struct xfs_name name;
+ int error;
/*
* Irix uses Missed'em'V split, but doesn't want to see
* the upper 5 bits of (14bit) major.
*/
- if (S_ISCHR(mode) || S_ISBLK(mode)) {
- if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+ if (S_ISCHR(args.mode) || S_ISBLK(args.mode)) {
+ if (unlikely(!sysv_valid_dev(args.rdev) ||
+ MAJOR(args.rdev) & ~0x1ff))
return -EINVAL;
} else {
- rdev = 0;
+ args.rdev = 0;
}
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ error = posix_acl_create(dir, &args.mode, &default_acl, &acl);
if (error)
return error;
/* Verify mode is valid also for tmpfile case */
- error = xfs_dentry_mode_to_name(&name, dentry, mode);
+ error = xfs_dentry_mode_to_name(&name, dentry, args.mode);
if (unlikely(error))
goto out_free_acl;
if (!tmpfile) {
- error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev,
- xfs_create_need_xattr(dir, default_acl, acl),
- &ip);
+ if (xfs_create_need_xattr(dir, default_acl, acl))
+ args.flags |= XFS_ICREATE_INIT_XATTRS;
+
+ error = xfs_create(&args, &name, &ip);
} else {
- error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip);
+ args.flags |= XFS_ICREATE_TMPFILE;
+
+ /*
+ * If this temporary file will not be linkable, don't bother
+ * creating an attr fork to receive a parent pointer.
+ */
+ if (tmpfile->f_flags & O_EXCL)
+ args.flags |= XFS_ICREATE_UNLINKABLE;
+
+ error = xfs_create_tmpfile(&args, &ip);
}
if (unlikely(error))
goto out_free_acl;
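
The XFS_ICREATE_UNLINKABLE case maps to tmpfiles opened with O_EXCL, which
can never be linked into the namespace via linkat(2), so (with parent
pointers enabled) there is no point reserving an attr fork for a parent
pointer. A short userspace sketch of both tmpfile flavors (hypothetical
directory path):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	/* Linkable tmpfile: may later be given a name with linkat(2). */
	int fd1 = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	/* O_EXCL tmpfile: can never be linked, so no attr fork is needed
	 * for a parent pointer. */
	int fd2 = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR, 0600);

	if (fd1 < 0 || fd2 < 0)
		perror("open(O_TMPFILE)");
	return 0;
}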
@@ -346,7 +369,7 @@ xfs_vn_ci_lookup(
dname.name = ci_name.name;
dname.len = ci_name.len;
dentry = d_add_ci(dentry, VFS_I(ip), &dname);
- kmem_free(ci_name.name);
+ kfree(ci_name.name);
return dentry;
}
@@ -364,6 +387,9 @@ xfs_vn_link(
if (unlikely(error))
return error;
+ if (IS_PRIVATE(inode))
+ return -EPERM;
+
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
return error;
@@ -521,7 +547,7 @@ xfs_stat_blksize(
* always return the realtime extent size.
*/
if (XFS_IS_REALTIME_INODE(ip))
- return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip));
+ return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? : 1);
/*
* Allow large block sizes to be reported to userspace programs if the
@@ -543,7 +569,47 @@ xfs_stat_blksize(
return 1U << mp->m_allocsize_log;
}
- return PAGE_SIZE;
+ return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
+}
+
+static void
+xfs_report_dioalign(
+ struct xfs_inode *ip,
+ struct kstat *stat)
+{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+
+ /*
+ * For COW inodes, we can only perform out of place writes of entire
+ * allocation units (blocks or RT extents).
+ * For writes smaller than the allocation unit, we must fall back to
+ * buffered I/O to perform read-modify-write cycles. At best this is
+ * highly inefficient; at worst it leads to page cache invalidation
+ * races. Tell applications to avoid this by reporting the larger write
+ * alignment in dio_offset_align, and the smaller read alignment in
+ * dio_read_offset_align.
+ */
+ stat->dio_read_offset_align = bdev_logical_block_size(bdev);
+ if (xfs_is_cow_inode(ip))
+ stat->dio_offset_align = xfs_inode_alloc_unitsize(ip);
+ else
+ stat->dio_offset_align = stat->dio_read_offset_align;
+}
+
+static void
+xfs_report_atomic_write(
+ struct xfs_inode *ip,
+ struct kstat *stat)
+{
+ unsigned int unit_min = 0, unit_max = 0;
+
+ if (xfs_inode_can_atomicwrite(ip))
+ unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
+ generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
}
STATIC int
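
Userspace reads these values back through statx(2). A hedged sketch of
querying the DIO geometry; the STATX_DIOALIGN fields date back to Linux 6.1,
while STATX_DIO_READ_ALIGN and stx_dio_read_offset_align assume headers new
enough to carry this patch series:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2)
		return 1;
	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx)) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_DIOALIGN) {
		printf("DIO: mem align %u, write offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
		/*
		 * Kernels with this patch also accept STATX_DIO_READ_ALIGN
		 * and fill stx_dio_read_offset_align with the smaller read
		 * alignment.
		 */
	}
	return 0;
}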
@@ -573,8 +639,9 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ fill_mg_cmtime(stat, request_mask, inode);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
@@ -584,11 +651,6 @@ xfs_vn_getattr(
}
}
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->change_cookie = inode_query_iversion(inode);
- stat->result_mask |= STATX_CHANGE_COOKIE;
- }
-
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
@@ -611,14 +673,10 @@ xfs_vn_getattr(
stat->rdev = inode->i_rdev;
break;
case S_IFREG:
- if (request_mask & STATX_DIOALIGN) {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- struct block_device *bdev = target->bt_bdev;
-
- stat->result_mask |= STATX_DIOALIGN;
- stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
- stat->dio_offset_align = bdev_logical_block_size(bdev);
- }
+ if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN))
+ xfs_report_dioalign(ip, stat);
+ if (request_mask & STATX_WRITE_ATOMIC)
+ xfs_report_atomic_write(ip, stat);
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
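
Similarly, an application can probe the advertised atomic write window before
issuing RWF_ATOMIC writes. A hedged sketch assuming Linux 6.11+ uapi names:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2)
		return 1;
	if (statx(AT_FDCWD, argv[1], 0, STATX_WRITE_ATOMIC, &stx)) {
		perror("statx");
		return 1;
	}
	/* generic_fill_statx_atomic_writes() sets this attribute only when
	 * a non-zero unit was reported by the filesystem. */
	if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)
		printf("atomic write unit: min %u max %u\n",
		       stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max);
	else
		printf("atomic writes not supported here\n");
	return 0;
}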
@@ -794,10 +852,10 @@ xfs_setattr_size(
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
+ uint resblks = 0;
bool did_zeroing = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
@@ -846,16 +904,6 @@ xfs_setattr_size(
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
} else {
- /*
- * iomap won't detect a dirty page over an unwritten block (or a
- * cow block over a hole) and subsequently skips zeroing the
- * newly post-EOF portion of the page. Flush the new EOF to
- * convert the block before the pagecache truncate.
- */
- error = filemap_write_and_wait_range(inode->i_mapping, newsize,
- newsize);
- if (error)
- return error;
error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
@@ -901,7 +949,17 @@ xfs_setattr_size(
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ /*
+ * For a realtime inode whose rtextsize covers more than one block, we
+ * need a block reservation for the bmap btree allocations/splits that
+ * truncation can trigger: it may split the tail written extent and
+ * convert the part beyond EOF to unwritten.
+ */
+ if (xfs_inode_has_bigrtalloc(ip))
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+ 0, 0, &tp);
if (error)
return error;
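
For context on the size of this reservation: XFS_DIOSTRAT_SPACE_RES(mp, 0)
amounts to one block for every bmap btree level a split could touch. A
paraphrase of the macro chain, from memory of xfs_trans_space.h (worth
verifying against the exact tree in use):

/* Paraphrased from fs/xfs/libxfs/xfs_trans_space.h: */
#define XFS_EXTENTADD_SPACE_RES(mp, w)	(XFS_BM_MAXLEVELS(mp, w) - 1)
#define XFS_DIOSTRAT_SPACE_RES(mp, v)	\
	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))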
@@ -1265,6 +1323,7 @@ xfs_setup_inode(
{
struct inode *inode = &ip->i_vnode;
gfp_t gfp_mask;
+ bool is_meta = xfs_is_internal_inode(ip);
inode->i_ino = ip->i_ino;
inode->i_state |= I_NEW;
@@ -1276,6 +1335,16 @@ xfs_setup_inode(
i_size_write(inode, ip->i_disk_size);
xfs_diflags_to_iflags(ip, true);
+ /*
+ * Mark our metadata files as private so that LSMs and the ACL code
+ * don't try to add their own metadata or reason about these files,
+ * and users cannot ever obtain file handles to them.
+ */
+ if (is_meta) {
+ inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
+
if (S_ISDIR(inode->i_mode)) {
/*
* We set the i_rwsem class here to avoid potential races with
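
S_PRIVATE is the flag the VFS and LSMs consult through the IS_PRIVATE()
macro; security hooks conventionally return success early for such inodes,
which is also why xfs_vn_link() above must reject them explicitly. A sketch
of the guard pattern (hypothetical hook, not from this patch):

#include <linux/fs.h>

static int example_inode_hook(struct inode *inode)
{
	/* LSM hooks conventionally ignore filesystem-internal inodes. */
	if (unlikely(IS_PRIVATE(inode)))
		return 0;

	/* ...normal policy checks would go here... */
	return 0;
}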
@@ -1285,9 +1354,9 @@ xfs_setup_inode(
*/
lockdep_set_class(&inode->i_rwsem,
&inode->i_sb->s_type->i_mutex_dir_key);
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
} else {
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
}
/*