diff options
Diffstat (limited to 'fs/xfs/xfs_iops.c')
| -rw-r--r-- | fs/xfs/xfs_iops.c | 1489 |
1 files changed, 881 insertions, 608 deletions
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 96dda62d497b..ad94fbf55014 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1,52 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" +#include "xfs_acl.h" +#include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_vnodeops.h" -#include "xfs_inode_item.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_dir2.h" +#include "xfs_iomap.h" +#include "xfs_error.h" +#include "xfs_ioctl.h" +#include "xfs_xattr.h" +#include "xfs_file.h" +#include "xfs_bmap.h" +#include "xfs_zone_alloc.h" -#include <linux/capability.h> -#include <linux/xattr.h> -#include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> +#include <linux/iversion.h> #include <linux/fiemap.h> -#include <linux/slab.h> + +/* + * Directories have different lock order w.r.t. mmap_lock compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_lock is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. Directories in the metadata directory + * tree get a separate class so that lockdep reports will warn us if someone + * ever tries to lock regular directories after locking metadata directories. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; static int xfs_initxattrs( @@ -59,8 +61,15 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_SECURE, + .name = xattr->name, + .namelen = strlen(xattr->name), + .value = xattr->value, + .valuelen = xattr->value_len, + }; + error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT); if (error < 0) break; } @@ -73,15 +82,14 @@ xfs_initxattrs( * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ - -STATIC int -xfs_init_security( +int +xfs_inode_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) { return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + &xfs_initxattrs, NULL); } static void @@ -91,6 +99,23 @@ xfs_dentry_to_name( { namep->name = dentry->d_name.name; namep->len = dentry->d_name.len; + namep->type = XFS_DIR3_FT_UNKNOWN; +} + +static int +xfs_dentry_mode_to_name( + struct xfs_name *namep, + struct dentry *dentry, + int mode) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = xfs_mode_to_ftype(mode); + + if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN)) + return -EFSCORRUPTED; + + return 0; } STATIC void @@ -103,95 +128,185 @@ xfs_cleanup_inode( /* Oh, the horror. * If we can't add the ACL or we fail in - * xfs_init_security we must back out. + * xfs_inode_init_security we must back out. * ENOSPC can hit here, among other things. */ xfs_dentry_to_name(&teardown, dentry); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } -STATIC int -xfs_vn_mknod( +/* + * Check to see if we are likely to need an extended attribute to be added to + * the inode we are about to allocate. This allows the attribute fork to be + * created during the inode allocation, reducing the number of transactions we + * need to do in this fast path. + * + * The security checks are optimistic, but not guaranteed. The two LSMs that + * require xattrs to be added here (selinux and smack) are also the only two + * LSMs that add a sb->s_security structure to the superblock. Hence if security + * is enabled and sb->s_security is set, we have a pretty good idea that we are + * going to be asked to add a security xattr immediately after allocating the + * xfs inode and instantiating the VFS inode. + */ +static inline bool +xfs_create_need_xattr( struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t rdev) + struct posix_acl *default_acl, + struct posix_acl *acl) { - struct inode *inode; - struct xfs_inode *ip = NULL; - struct posix_acl *default_acl = NULL; - struct xfs_name name; - int error; + if (acl) + return true; + if (default_acl) + return true; +#if IS_ENABLED(CONFIG_SECURITY) + if (dir->i_sb->s_security) + return true; +#endif + return false; +} + + +STATIC int +xfs_generic_create( + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev, + struct file *tmpfile) /* unnamed file */ +{ + struct xfs_icreate_args args = { + .idmap = idmap, + .pip = XFS_I(dir), + .rdev = rdev, + .mode = mode, + }; + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl, *acl; + struct xfs_name name; + int error; /* * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ - if (S_ISCHR(mode) || S_ISBLK(mode)) { - if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + if (S_ISCHR(args.mode) || S_ISBLK(args.mode)) { + if (unlikely(!sysv_valid_dev(args.rdev) || + MAJOR(args.rdev) & ~0x1ff)) return -EINVAL; - rdev = sysv_encode_dev(rdev); } else { - rdev = 0; + args.rdev = 0; } - if (IS_POSIXACL(dir)) { - default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(default_acl)) - return PTR_ERR(default_acl); + error = posix_acl_create(dir, &args.mode, &default_acl, &acl); + if (error) + return error; - if (!default_acl) - mode &= ~current_umask(); - } + /* Verify mode is valid also for tmpfile case */ + error = xfs_dentry_mode_to_name(&name, dentry, args.mode); + if (unlikely(error)) + goto out_free_acl; - xfs_dentry_to_name(&name, dentry); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + if (xfs_create_need_xattr(dir, default_acl, acl)) + args.flags |= XFS_ICREATE_INIT_XATTRS; + + error = xfs_create(&args, &name, &ip); + } else { + args.flags |= XFS_ICREATE_TMPFILE; + + /* + * If this temporary file will not be linkable, don't bother + * creating an attr fork to receive a parent pointer. + */ + if (tmpfile->f_flags & O_EXCL) + args.flags |= XFS_ICREATE_UNLINKABLE; + + error = xfs_create_tmpfile(&args, &ip); + } if (unlikely(error)) goto out_free_acl; inode = VFS_I(ip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; if (default_acl) { - error = -xfs_inherit_acl(inode, default_acl); - default_acl = NULL; - if (unlikely(error)) + error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto out_cleanup_inode; + } + if (acl) { + error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) goto out_cleanup_inode; } + xfs_setup_iops(ip); - d_instantiate(dentry, inode); - return -error; + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); + d_tmpfile(tmpfile, inode); + } else + d_instantiate(dentry, inode); + + xfs_finish_inode_setup(ip); - out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); out_free_acl: posix_acl_release(default_acl); - return -error; + posix_acl_release(acl); + return error; + + out_cleanup_inode: + xfs_finish_inode_setup(ip); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + xfs_irele(ip); + goto out_free_acl; } STATIC int -xfs_vn_create( - struct inode *dir, - struct dentry *dentry, - umode_t mode, - bool flags) +xfs_vn_mknod( + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) { - return xfs_vn_mknod(dir, dentry, mode, 0); + return xfs_generic_create(idmap, dir, dentry, mode, rdev, NULL); } STATIC int +xfs_vn_create( + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool flags) +{ + return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); +} + +STATIC struct dentry * xfs_vn_mkdir( - struct inode *dir, - struct dentry *dentry, - umode_t mode) + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); + return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL)); } STATIC struct dentry * @@ -200,6 +315,7 @@ xfs_vn_lookup( struct dentry *dentry, unsigned int flags) { + struct inode *inode; struct xfs_inode *cip; struct xfs_name name; int error; @@ -209,14 +325,13 @@ xfs_vn_lookup( xfs_dentry_to_name(&name, dentry); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); - if (unlikely(error)) { - if (unlikely(error != ENOENT)) - return ERR_PTR(-error); - d_add(dentry, NULL); - return NULL; - } - - return d_splice_alias(VFS_I(cip), dentry); + if (likely(!error)) + inode = VFS_I(cip); + else if (likely(error == -ENOENT)) + inode = NULL; + else + inode = ERR_PTR(error); + return d_splice_alias(inode, dentry); } STATIC struct dentry * @@ -237,8 +352,8 @@ xfs_vn_ci_lookup( xfs_dentry_to_name(&xname, dentry); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { - if (unlikely(error != ENOENT)) - return ERR_PTR(-error); + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); /* * call d_add(dentry, NULL) here when d_drop_negative_children * is called in xfs_vn_mknod (ie. allow negative dentries @@ -255,7 +370,7 @@ xfs_vn_ci_lookup( dname.name = ci_name.name; dname.len = ci_name.len; dentry = d_add_ci(dentry, VFS_I(ip), &dname); - kmem_free(ci_name.name); + kfree(ci_name.name); return dentry; } @@ -265,15 +380,20 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry); + error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode); + if (unlikely(error)) + return error; + + if (IS_PRIVATE(inode)) + return -EPERM; error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) - return -error; + return error; ihold(inode); d_instantiate(dentry, inode); @@ -290,7 +410,7 @@ xfs_vn_unlink( xfs_dentry_to_name(&name, dentry); - error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); if (error) return error; @@ -299,63 +419,86 @@ xfs_vn_unlink( * but still hashed. This is incompatible with case-insensitive * mode, so invalidate (unhash) the dentry in CI-mode. */ - if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + if (xfs_has_asciici(XFS_M(dir->i_sb))) d_invalidate(dentry); return 0; } STATIC int xfs_vn_symlink( - struct inode *dir, - struct dentry *dentry, - const char *symname) + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + const char *symname) { - struct inode *inode; - struct xfs_inode *cip = NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); - xfs_dentry_to_name(&name, dentry); + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out; - error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + error = xfs_symlink(idmap, XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; inode = VFS_I(cip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; + xfs_setup_iops(cip); + d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); return 0; out_cleanup_inode: + xfs_finish_inode_setup(cip); xfs_cleanup_inode(dir, inode, dentry); + xfs_irele(cip); out: - return -error; + return error; } STATIC int xfs_vn_rename( - struct inode *odir, - struct dentry *odentry, - struct inode *ndir, - struct dentry *ndentry) + struct mnt_idmap *idmap, + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry, + unsigned int flags) { - struct inode *new_inode = ndentry->d_inode; + struct inode *new_inode = d_inode(ndentry); + int omode = 0; + int error; struct xfs_name oname; struct xfs_name nname; - xfs_dentry_to_name(&oname, odentry); - xfs_dentry_to_name(&nname, ndentry); + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; - return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), - XFS_I(ndir), &nname, new_inode ? - XFS_I(new_inode) : NULL); + /* if we are exchanging files, we need to set i_mode of both files */ + if (flags & RENAME_EXCHANGE) + omode = d_inode(ndentry)->i_mode; + + error = xfs_dentry_mode_to_name(&oname, odentry, omode); + if (omode && unlikely(error)) + return error; + + error = xfs_dentry_mode_to_name(&nname, ndentry, + d_inode(odentry)->i_mode); + if (unlikely(error)) + return error; + + return xfs_rename(idmap, XFS_I(odir), &oname, + XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, + new_inode ? XFS_I(new_inode) : NULL, flags); } /* @@ -363,91 +506,246 @@ xfs_vn_rename( * we need to be very careful about how much stack we use. * uio is kmalloced for this reason... */ -STATIC void * -xfs_vn_follow_link( +STATIC const char * +xfs_vn_get_link( struct dentry *dentry, - struct nameidata *nd) + struct inode *inode, + struct delayed_call *done) { char *link; int error = -ENOMEM; - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!dentry) + return ERR_PTR(-ECHILD); + + link = kmalloc(XFS_SYMLINK_MAXLEN+1, GFP_KERNEL); if (!link) goto out_err; - error = -xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (unlikely(error)) goto out_kfree; - nd_set_link(nd, link); - return NULL; + set_delayed_call(done, kfree_link, link); + return link; out_kfree: kfree(link); out_err: - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) +static uint32_t +xfs_stat_blksize( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If the file blocks are being allocated from a realtime volume, then + * always return the realtime extent size. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? : 1); + + /* + * Allow large block sizes to be reported to userspace programs if the + * "largeio" mount option is used. + * + * If compatibility mode is specified, simply return the basic unit of + * caching so that we don't get inefficient read/modify/write I/O from + * user apps. Otherwise.... + * + * If the underlying volume is a stripe, then return the stripe width in + * bytes as the recommended I/O size. It is not a stripe and we've set a + * default buffered I/O size, return that, otherwise return the compat + * default. + */ + if (xfs_has_large_iosize(mp)) { + if (mp->m_swidth) + return XFS_FSB_TO_B(mp, mp->m_swidth); + if (xfs_has_allocsize(mp)) + return 1U << mp->m_allocsize_log; + } + + return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); +} + +static void +xfs_report_dioalign( + struct xfs_inode *ip, + struct kstat *stat) +{ + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + + /* + * For COW inodes, we can only perform out of place writes of entire + * allocation units (blocks or RT extents). + * For writes smaller than the allocation unit, we must fall back to + * buffered I/O to perform read-modify-write cycles. At best this is + * highly inefficient; at worst it leads to page cache invalidation + * races. Tell applications to avoid this by reporting the larger write + * alignment in dio_offset_align, and the smaller read alignment in + * dio_read_offset_align. + */ + stat->dio_read_offset_align = bdev_logical_block_size(bdev); + if (xfs_is_cow_inode(ip)) + stat->dio_offset_align = xfs_inode_alloc_unitsize(ip); + else + stat->dio_offset_align = stat->dio_read_offset_align; +} + +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || + xfs_inode_can_sw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + + return 0; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_inode_can_sw_atomic_write(ip)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) { - char *s = nd_get_link(nd); + unsigned int awu_max = xfs_get_atomic_write_max(ip); - if (!IS_ERR(s)) - kfree(s); + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. + */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max); +} + +static void +xfs_report_atomic_write( + struct xfs_inode *ip, + struct kstat *stat) +{ + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); } STATIC int xfs_vn_getattr( - struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) + struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, + u32 request_mask, + unsigned int query_flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); trace_xfs_getattr(ip); - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + if (xfs_is_shutdown(mp)) + return -EIO; stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; - stat->mode = ip->i_d.di_mode; - stat->nlink = ip->i_d.di_nlink; - stat->uid = ip->i_d.di_uid; - stat->gid = ip->i_d.di_gid; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; - stat->blocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + stat->atime = inode_get_atime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); + + if (xfs_has_v3inodes(mp)) { + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = ip->i_crtime; + } + } + + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ + if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (ip->i_diflags & XFS_DIFLAG_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (ip->i_diflags & XFS_DIFLAG_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: stat->blksize = BLKDEV_IOSIZE; - stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); + stat->rdev = inode->i_rdev; break; + case S_IFREG: + if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) + xfs_report_dioalign(ip, stat); + if (request_mask & STATX_WRITE_ATOMIC) + xfs_report_atomic_write(ip, stat); + fallthrough; default: - if (XFS_IS_REALTIME_INODE(ip)) { - /* - * If the file blocks are being allocated from a - * realtime volume, then return the inode's realtime - * extent size or the realtime volume's extent size. - */ - stat->blksize = - xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; - } else - stat->blksize = xfs_preferred_iosize(mp); + stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; break; } @@ -455,55 +753,45 @@ xfs_vn_getattr( return 0; } -static void -xfs_setattr_mode( - struct xfs_trans *tp, - struct xfs_inode *ip, +static int +xfs_vn_change_ok( + struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; + struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount; - ASSERT(tp); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (xfs_is_readonly(mp)) + return -EROFS; - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; + if (xfs_is_shutdown(mp)) + return -EIO; - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; + return setattr_prepare(idmap, dentry, iattr); } -int +/* + * Set non-size attributes of an inode. + * + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine. + */ +static int xfs_setattr_nonsize( + struct mnt_idmap *idmap, + struct dentry *dentry, struct xfs_inode *ip, - struct iattr *iattr, - int flags) + struct iattr *iattr) { xfs_mount_t *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - uid_t uid = 0, iuid = 0; - gid_t gid = 0, igid = 0; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; - struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; - - trace_xfs_setattr(ip); - - /* If acls are being inherited, we already have this checked */ - if (!(flags & XFS_ATTR_NOACL)) { - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - error = -inode_change_ok(inode, iattr); - if (error) - return XFS_ERROR(error); - } + struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL; ASSERT((mask & ATTR_SIZE) == 0); @@ -519,16 +807,18 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(idmap, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { - uid = ip->i_d.di_uid; + uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(idmap, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { - gid = ip->i_d.di_gid; + gid = inode->i_gid; } /* @@ -538,138 +828,55 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), - qflags, &udqp, &gdqp, NULL); + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_projid, + qflags, &udqp, &gdqp, NULL); if (error) return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_dqrele; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = ip->i_d.di_uid; - igid = ip->i_d.di_gid; - gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; - uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - - /* - * Do a quota reservation only if uid/gid is actually - * going to change. - */ - if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { - ASSERT(tp); - error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - NULL, capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (error) /* out of quota */ - goto out_trans_cancel; - } - } - - xfs_trans_ijoin(tp, ip, 0); - - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (iuid != uid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & ATTR_UID); - ASSERT(udqp); - olddquot1 = xfs_qm_vop_chown(tp, ip, - &ip->i_udquot, udqp); - } - ip->i_d.di_uid = uid; - inode->i_uid = uid; - } - if (igid != gid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & ATTR_GID); - ASSERT(gdqp); - olddquot2 = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); - } - ip->i_d.di_gid = gid; - inode->i_gid = gid; - } - } - - /* - * Change file access modes. - */ - if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - /* - * Change file access or modified times. + * Register quota modifications in the transaction. Must be the owner + * or privileged. These IDs could have changed since we last looked at + * them. But, we're assured that if the ownership did change while we + * didn't have the inode locked, inode's dquot(s) would have changed + * also. */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(idmap, iattr, inode)) { + ASSERT(udqp); + old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(idmap, iattr, inode)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); + ASSERT(gdqp); + old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. */ - xfs_qm_dqrele(olddquot1); - xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(old_udqp); + xfs_qm_dqrele(old_gdqp); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); if (error) - return XFS_ERROR(error); + return error; /* * XXX(hch): Updating the ACL entries is not atomic vs the i_mode @@ -678,17 +885,14 @@ xfs_setattr_nonsize( * to attr_set. No previous user of the generic * Posix ACL code seems to care about this issue either. */ - if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -xfs_acl_chmod(inode); + if (mask & ATTR_MODE) { + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (error) - return XFS_ERROR(error); + return error; } return 0; -out_trans_cancel: - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -697,42 +901,31 @@ out_dqrele: /* * Truncate file. Must have write permission and not be a directory. + * + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine. */ -int +STATIC int xfs_setattr_size( + struct mnt_idmap *idmap, + struct dentry *dentry, struct xfs_inode *ip, - struct iattr *iattr, - int flags) + struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); - int mask = iattr->ia_valid; xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags = 0; - uint commit_flags = 0; - - trace_xfs_setattr(ip); + uint resblks = 0; + bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - error = -inode_change_ok(inode, iattr); - if (error) - return XFS_ERROR(error); - - ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - - if (!(flags & XFS_ATTR_NOLOCK)) { - lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); - } + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + ASSERT(S_ISREG(inode->i_mode)); + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; @@ -740,86 +933,130 @@ xfs_setattr_size( /* * Short circuit the truncate case for zero length files. */ - if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { - if (!(mask & (ATTR_CTIME|ATTR_MTIME))) - goto out_unlock; + if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) { + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) + return 0; /* * Use the regular setattr path to update the timestamps. */ - xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(ip, iattr, 0); + return xfs_setattr_nonsize(idmap, dentry, ip, iattr); } /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) - goto out_unlock; + return error; /* - * Now we can make the changes. Before we join the inode to the - * transaction, take care of the part of the truncation that must be - * done without the inode lock. This needs to be done before joining - * the inode to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. + */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(mp, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* + * File data changes must be complete before we start the transaction to + * modify the inode. This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { - /* - * Do the first part of growing a file: zero any data in the - * last block that is beyond the old EOF. We need to do this - * before the inode is joined to the transaction to modify - * i_size. - */ - error = xfs_zero_eof(ip, newsize, oldsize); - if (error) - goto out_unlock; + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); + error = xfs_zero_range(ip, oldsize, newsize - oldsize, + &ac, &did_zeroing); + } else { + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(mp, &ac); + + if (error) + return error; + + /* + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAP_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. + * + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. + * + * And we update in-core i_size and truncate page cache beyond newsize + * before writeback the [i_disk_size, newsize] range, so we're + * guaranteed not to write stale data past the new EOF on truncate down. + */ + truncate_setsize(inode, newsize); + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files - * problem. - * - * Only flush from the on disk size to the smaller of the in memory - * file size or the new size as that's the range we really care about - * here and prevents waiting for other data not within the range we - * care about here. + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ip->i_d.di_size, newsize); + if (did_zeroing || + (newsize > ip->i_disk_size && oldsize != ip->i_disk_size)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_disk_size, newsize - 1); if (error) - goto out_unlock; + return error; } /* - * Wait for all direct I/O to complete. + * For realtime inode with more than one block rtextsize, we need the + * block reservation for bmap btree block allocations/splits that can + * happen since it could split the tail written extent and convert the + * right beyond EOF one to unwritten. */ - inode_dio_wait(inode); - - error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); - if (error) - goto out_unlock; + if (xfs_inode_has_bigrtalloc(ip)) + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, 0, &tp); if (error) - goto out_trans_cancel; - - truncate_setsize(inode, newsize); + return error; - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); /* @@ -832,10 +1069,11 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = - current_fs_time(inode->i_sb); - mask |= ATTR_CTIME | ATTR_MTIME; + current_time(inode); + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* @@ -850,13 +1088,13 @@ xfs_setattr_size( * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ - ip->i_d.di_size = newsize; + ip->i_disk_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) - goto out_trans_abort; + goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data @@ -871,137 +1109,121 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - /* - * Change file access modes. - */ - if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } - + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; -out_trans_abort: - commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, commit_flags); + xfs_trans_cancel(tp); goto out_unlock; } +int +xfs_vn_setattr_size( + struct mnt_idmap *idmap, + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; + + trace_xfs_setattr(ip); + + error = xfs_vn_change_ok(idmap, dentry, iattr); + if (error) + return error; + return xfs_setattr_size(idmap, dentry, ip, iattr); +} + STATIC int xfs_vn_setattr( - struct dentry *dentry, - struct iattr *iattr) + struct mnt_idmap *idmap, + struct dentry *dentry, + struct iattr *iattr) { - if (iattr->ia_valid & ATTR_SIZE) - return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0); - return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); + int error; + + if (iattr->ia_valid & ATTR_SIZE) { + uint iolock; + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); + if (error) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + return error; + } + + error = xfs_vn_setattr_size(idmap, dentry, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + } else { + trace_xfs_setattr(ip); + + error = xfs_vn_change_ok(idmap, dentry, iattr); + if (!error) + error = xfs_setattr_nonsize(idmap, dentry, ip, iattr); + } + + return error; } STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec *now, int flags) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; + struct timespec64 now; trace_xfs_update_time(ip); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return -error; - } + if (inode->i_sb->s_flags & SB_LAZYTIME) { + if (!((flags & S_VERSION) && + inode_maybe_inc_iversion(inode, false))) { + generic_update_time(inode, flags); + return 0; + } - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) { - inode->i_ctime = *now; - ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; - } - if (flags & S_MTIME) { - inode->i_mtime = *now; - ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + /* Capture the iversion update that just occurred */ + log_flags |= XFS_ILOG_CORE; } - if (flags & S_ATIME) { - inode->i_atime = *now; - ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; - } - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return -xfs_trans_commit(tp, 0); -} - -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) + return error; - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & (S_CTIME|S_MTIME)) + now = inode_set_ctime_current(inode); + else + now = current_time(inode); - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } + if (flags & S_MTIME) + inode_set_mtime_to_ts(inode, now); + if (flags & S_ATIME) + inode_set_atime_to_ts(inode, now); - return -error; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, log_flags); + return xfs_trans_commit(tp); } STATIC int @@ -1011,50 +1233,44 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; + error = iomap_fiemap(inode, fieinfo, start, length, + &xfs_xattr_iomap_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, length, + &xfs_read_iomap_ops); + } + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBB(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(length); - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return -error; + return error; +} - return 0; +STATIC int +xfs_vn_tmpfile( + struct mnt_idmap *idmap, + struct inode *dir, + struct file *file, + umode_t mode) +{ + int err = xfs_generic_create(idmap, dir, file->f_path.dentry, mode, 0, file); + + return finish_open_simple(file, err); } static const struct inode_operations xfs_inode_operations = { - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .fiemap = xfs_vn_fiemap, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; static const struct inode_operations xfs_dir_inode_operations = { @@ -1073,14 +1289,15 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1099,113 +1316,185 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; static const struct inode_operations xfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, - .get_acl = xfs_get_acl, + .get_link = xfs_vn_get_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; -STATIC void -xfs_diflags_to_iflags( - struct inode *inode, +/* Figure out if this file actually supports DAX. */ +static bool +xfs_inode_supports_dax( struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + struct xfs_mount *mp = ip->i_mount; + + /* Only supported on regular files. */ + if (!S_ISREG(VFS_I(ip)->i_mode)) + return false; + + /* Block size must match page size */ + if (mp->m_sb.sb_blocksize != PAGE_SIZE) + return false; + + /* Device has to support DAX too. */ + return xfs_inode_buftarg(ip)->bt_daxdev != NULL; +} + +static bool +xfs_inode_should_enable_dax( + struct xfs_inode *ip) +{ + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + if (xfs_has_dax_never(ip->i_mount)) + return false; + if (!xfs_inode_supports_dax(ip)) + return false; + if (xfs_has_dax_always(ip->i_mount)) + return true; + if (ip->i_diflags2 & XFS_DIFLAG2_DAX) + return true; + return false; +} + +void +xfs_diflags_to_iflags( + struct xfs_inode *ip, + bool init) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + unsigned int flags = 0; + + ASSERT(!(IS_DAX(inode) && init)); + + if (xflags & FS_XFLAG_IMMUTABLE) + flags |= S_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + flags |= S_APPEND; + if (xflags & FS_XFLAG_SYNC) + flags |= S_SYNC; + if (xflags & FS_XFLAG_NOATIME) + flags |= S_NOATIME; + if (init && xfs_inode_should_enable_dax(ip)) + flags |= S_DAX; + + /* + * S_DAX can only be set during inode initialization and is never set by + * the VFS, so we cannot mask off S_DAX in i_flags. + */ + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); + inode->i_flags |= flags; } /* - * Initialize the Linux inode, set up the operation vectors and - * unlock the inode. - * - * When reading existing inodes from disk this is called directly - * from xfs_iget, when creating a new inode it is called from - * xfs_ialloc after setting up the inode. + * Initialize the Linux inode. * - * We are always called with an uninitialised linux inode here. - * We need to initialise the necessary fields and take a reference - * on it. + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_init_new_inode after setting + * up the inode. These callers have different criteria for clearing XFS_INEW, so + * leave it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; + gfp_t gfp_mask; + bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; - inode->i_state = I_NEW; + inode_state_set_raw(inode, I_NEW); inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ - hlist_add_fake(&inode->i_hash); + inode_fake_hash(inode); - inode->i_mode = ip->i_d.di_mode; - set_nlink(inode, ip->i_d.di_nlink); - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; + i_size_write(inode, ip->i_disk_size); + xfs_diflags_to_iflags(ip, true); - switch (inode->i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - inode->i_rdev = - MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); - break; - default: - inode->i_rdev = 0; - break; + /* + * Mark our metadata files as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files, + * and users cannot ever obtain file handles to them. + */ + if (is_meta) { + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } + + if (S_ISDIR(inode->i_mode)) { + /* + * We set the i_rwsem class here to avoid potential races with + * lockdep_annotate_inode_mutex_key() reinitialising the lock + * after a filehandle lookup has already found the inode in + * cache before it has been unlocked via unlock_new_inode(). + */ + lockdep_set_class(&inode->i_rwsem, + &inode->i_sb->s_type->i_mutex_dir_key); + lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class); + } else { + lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class); + } + + /* + * Ensure all page cache allocations are done from GFP_NOFS context to + * prevent direct reclaim recursion back into the filesystem and blowing + * stacks or deadlocking. + */ + gfp_mask = mapping_gfp_mask(inode->i_mapping); + mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + + /* + * For real-time inodes update the stable write flags to that of the RT + * device instead of the data device. + */ + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) + xfs_update_stable_writes(ip); + + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!xfs_inode_has_attr_fork(ip)) { + inode_has_no_xattr(inode); + cache_no_acl(inode); } +} - inode->i_generation = ip->i_d.di_gen; - i_size_write(inode, ip->i_d.di_size); - inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; - inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; - inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - xfs_diflags_to_iflags(inode, ip); +void +xfs_setup_iops( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &xfs_dax_aops; + else + inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + if (xfs_has_asciici(XFS_M(inode->i_sb))) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; @@ -1213,26 +1502,10 @@ xfs_setup_inode( break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; - if (!(ip->i_df.if_flags & XFS_IFINLINE)) - inode->i_mapping->a_ops = &xfs_address_space_operations; break; default: inode->i_op = &xfs_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } - - /* - * If there is no attribute fork no ACL can exist on this inode, - * and it can't have any file capabilities attached to it either. - */ - if (!XFS_IFORK_Q(ip)) { - inode_has_no_xattr(inode); - cache_no_acl(inode); - } - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); } |
