Diffstat (limited to 'fs/xfs/xfs_super.c')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | fs/xfs/xfs_super.c | 3087 |

1 file changed, 1853 insertions, 1234 deletions
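The headline change in this commit is the migration from the old match_table_t/strsep() mount option parser (removed in the hunks below) to the VFS's generic fs_context/fs_parser infrastructure (added below). As a minimal, hedged sketch of that API: "foofs", its two options, and foofs_ctx are hypothetical stand-ins invented for illustration; fsparam_flag()/fsparam_u32(), fs_parse(), struct fs_parameter_spec, and struct fs_parse_result are the real interfaces that the new xfs_fs_parse_param() in this diff uses (XFS itself stashes its xfs_mount in fc->s_fs_info rather than a private context).

/*
 * Hedged sketch, not XFS code: the shape of an fs_context-era
 * mount option parser. Everything named "foofs" is hypothetical.
 */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_nodiscard_demo, Opt_logbufs_demo };

static const struct fs_parameter_spec foofs_parameters[] = {
	fsparam_flag("nodiscard", Opt_nodiscard_demo),	/* bare flag */
	fsparam_u32("logbufs",    Opt_logbufs_demo),	/* "logbufs=8" */
	{}
};

struct foofs_ctx {
	bool		discard;
	unsigned int	logbufs;
};

static int foofs_parse_param(struct fs_context *fc,
			     struct fs_parameter *param)
{
	struct foofs_ctx *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	/* fs_parse() matches the table and decodes the typed value */
	opt = fs_parse(fc, foofs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_nodiscard_demo:
		ctx->discard = false;
		return 0;
	case Opt_logbufs_demo:
		ctx->logbufs = result.uint_32; /* already parsed as u32 */
		return 0;
	}
	return -EINVAL;
}

Compared with the removed parser, the table carries the value types, so per-option match_int()/match_strdup() boilerplate disappears and invalid values are rejected before the switch runs.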
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 38aaacdbb8b3..bc71aa9dcee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" @@ -23,18 +11,15 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_alloc.h" -#include "xfs_error.h" #include "xfs_fsops.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" @@ -50,471 +35,183 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" - -#include <linux/namei.h> -#include <linux/dax.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/mount.h> -#include <linux/mempool.h> -#include <linux/writeback.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/parser.h> +#include "xfs_pwork.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" +#include "xfs_iunlink_item.h" +#include "xfs_dahash_test.h" +#include "xfs_rtbitmap.h" +#include "xfs_exchmaps_item.h" +#include "xfs_parent.h" +#include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" +#include "scrub/stats.h" +#include "scrub/rcbag_btree.h" + +#include <linux/magic.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> static const struct super_operations xfs_super_operations; -struct bio_set *xfs_ioend_bioset; +static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -/* - * Table driven mount option parser. 
- */ -enum { - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, - Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, - Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, - Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier, - Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep, - Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams, - Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, - Opt_uquota, Opt_gquota, Opt_pquota, - Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, -}; - -static const match_table_t tokens = { - {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */ - {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */ - {Opt_logdev, "logdev=%s"}, /* log device */ - {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */ - {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */ - {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */ - {Opt_noalign, "noalign"}, /* turn off stripe alignment */ - {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */ - {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ - {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ - {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ - {Opt_mtpt, "mtpt"}, /* filesystem mount point */ - {Opt_grpid, "grpid"}, /* group-ID from parent directory */ - {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ - {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ - {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ - {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ - {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ - {Opt_inode32, "inode32"}, /* inode allocation limited to - * XFS_MAXINUMBER_32 */ - {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */ - {Opt_noikeep, "noikeep"}, /* free empty inode clusters */ - {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */ - {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes - * in stat(). */ - {Opt_attr2, "attr2"}, /* do use attr2 attribute format */ - {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */ - {Opt_filestreams,"filestreams"},/* use filestreams allocator */ - {Opt_quota, "quota"}, /* disk quotas (user) */ - {Opt_noquota, "noquota"}, /* no quotas */ - {Opt_usrquota, "usrquota"}, /* user quota enabled */ - {Opt_grpquota, "grpquota"}, /* group quota enabled */ - {Opt_prjquota, "prjquota"}, /* project quota enabled */ - {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */ - {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */ - {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */ - {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */ - {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */ - {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */ - {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */ - {Opt_discard, "discard"}, /* Discard unused blocks */ - {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ - - {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ - - /* Deprecated mount options scheduled for removal */ - {Opt_barrier, "barrier"}, /* use writer barriers for log write and - * unwritten extent conversion */ - {Opt_nobarrier, "nobarrier"}, /* .. 
disable */ - - {Opt_err, NULL}, +enum xfs_dax_mode { + XFS_DAX_INODE = 0, + XFS_DAX_ALWAYS = 1, + XFS_DAX_NEVER = 2, }; +/* Were quota mount options provided? Must use the upper 16 bits of qflags. */ +#define XFS_QFLAGS_MNTOPTS (1U << 31) -STATIC int -suffix_kstrtoint(const substring_t *s, unsigned int base, int *res) +static void +xfs_mount_set_dax_mode( + struct xfs_mount *mp, + enum xfs_dax_mode mode) { - int last, shift_left_factor = 0, _res; - char *value; - int ret = 0; - - value = match_strdup(s); - if (!value) - return -ENOMEM; - - last = strlen(value) - 1; - if (value[last] == 'K' || value[last] == 'k') { - shift_left_factor = 10; - value[last] = '\0'; + switch (mode) { + case XFS_DAX_INODE: + mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER); + break; + case XFS_DAX_ALWAYS: + mp->m_features |= XFS_FEAT_DAX_ALWAYS; + mp->m_features &= ~XFS_FEAT_DAX_NEVER; + break; + case XFS_DAX_NEVER: + mp->m_features |= XFS_FEAT_DAX_NEVER; + mp->m_features &= ~XFS_FEAT_DAX_ALWAYS; + break; } - if (value[last] == 'M' || value[last] == 'm') { - shift_left_factor = 20; - value[last] = '\0'; - } - if (value[last] == 'G' || value[last] == 'g') { - shift_left_factor = 30; - value[last] = '\0'; - } - - if (kstrtoint(value, base, &_res)) - ret = -EINVAL; - kfree(value); - *res = _res << shift_left_factor; - return ret; } +static const struct constant_table dax_param_enums[] = { + {"inode", XFS_DAX_INODE }, + {"always", XFS_DAX_ALWAYS }, + {"never", XFS_DAX_NEVER }, + {} +}; + /* - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock has _not_ yet been read in. - * - * Note that this function leaks the various device name allocations on - * failure. The caller takes care of them. - * - * *sb is const because this is also used to test options on the remount - * path, and we don't want this to have any side effects at remount time. - * Today this function does not change *sb, but just to future-proof... + * Table driven mount option parser. */ -STATIC int -xfs_parseargs( - struct xfs_mount *mp, - char *options) -{ - const struct super_block *sb = mp->m_super; - char *p; - substring_t args[MAX_OPT_ARGS]; - int dsunit = 0; - int dswidth = 0; - int iosize = 0; - uint8_t iosizelog = 0; - - /* - * set up the mount name first so all the errors will refer to the - * correct device. - */ - mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); - if (!mp->m_fsname) - return -ENOMEM; - mp->m_fsname_len = strlen(mp->m_fsname) + 1; - - /* - * Copy binary VFS mount flags we are interested in. - */ - if (sb->s_flags & MS_RDONLY) - mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & MS_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) - mp->m_flags |= XFS_MOUNT_WSYNC; - - /* - * Set some default flags that could be cleared by the mount option - * parsing. - */ - mp->m_flags |= XFS_MOUNT_BARRIER; - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * These can be overridden by the mount option parsing. 
- */ - mp->m_logbufs = -1; - mp->m_logbsize = -1; +enum { + Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, + Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, + Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, + Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, + Opt_largeio, Opt_nolargeio, + Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, + Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, + Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, +}; - if (!options) - goto done; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_logbufs: - if (match_int(args, &mp->m_logbufs)) - return -EINVAL; - break; - case Opt_logbsize: - if (suffix_kstrtoint(args, 10, &mp->m_logbsize)) - return -EINVAL; - break; - case Opt_logdev: - mp->m_logname = match_strdup(args); - if (!mp->m_logname) - return -ENOMEM; - break; - case Opt_mtpt: - xfs_warn(mp, "%s option not allowed on this system", p); - return -EINVAL; - case Opt_rtdev: - mp->m_rtname = match_strdup(args); - if (!mp->m_rtname) - return -ENOMEM; - break; - case Opt_allocsize: - case Opt_biosize: - if (suffix_kstrtoint(args, 10, &iosize)) - return -EINVAL; - iosizelog = ffs(iosize) - 1; - break; - case Opt_grpid: - case Opt_bsdgroups: - mp->m_flags |= XFS_MOUNT_GRPID; - break; - case Opt_nogrpid: - case Opt_sysvgroups: - mp->m_flags &= ~XFS_MOUNT_GRPID; - break; - case Opt_wsync: - mp->m_flags |= XFS_MOUNT_WSYNC; - break; - case Opt_norecovery: - mp->m_flags |= XFS_MOUNT_NORECOVERY; - break; - case Opt_noalign: - mp->m_flags |= XFS_MOUNT_NOALIGN; - break; - case Opt_swalloc: - mp->m_flags |= XFS_MOUNT_SWALLOC; - break; - case Opt_sunit: - if (match_int(args, &dsunit)) - return -EINVAL; - break; - case Opt_swidth: - if (match_int(args, &dswidth)) - return -EINVAL; - break; - case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - break; - case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - break; - case Opt_nouuid: - mp->m_flags |= XFS_MOUNT_NOUUID; - break; - case Opt_ikeep: - mp->m_flags |= XFS_MOUNT_IKEEP; - break; - case Opt_noikeep: - mp->m_flags &= ~XFS_MOUNT_IKEEP; - break; - case Opt_largeio: - mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; - break; - case Opt_nolargeio: - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - break; - case Opt_attr2: - mp->m_flags |= XFS_MOUNT_ATTR2; - break; - case Opt_noattr2: - mp->m_flags &= ~XFS_MOUNT_ATTR2; - mp->m_flags |= XFS_MOUNT_NOATTR2; - break; - case Opt_filestreams: - mp->m_flags |= XFS_MOUNT_FILESTREAMS; - break; - case Opt_noquota: - mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; - mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; - mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; - break; - case Opt_quota: - case Opt_uquota: - case Opt_usrquota: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_UQUOTA_ENFD); - break; - case Opt_qnoenforce: - case Opt_uqnoenforce: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_UQUOTA_ENFD; - break; - case Opt_pquota: - case Opt_prjquota: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_PQUOTA_ENFD); - break; - case Opt_pqnoenforce: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_PQUOTA_ENFD; - break; - case Opt_gquota: - case Opt_grpquota: - mp->m_qflags |= (XFS_GQUOTA_ACCT | 
XFS_GQUOTA_ACTIVE | - XFS_GQUOTA_ENFD); - break; - case Opt_gqnoenforce: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_GQUOTA_ENFD; - break; - case Opt_discard: - mp->m_flags |= XFS_MOUNT_DISCARD; - break; - case Opt_nodiscard: - mp->m_flags &= ~XFS_MOUNT_DISCARD; - break; -#ifdef CONFIG_FS_DAX - case Opt_dax: - mp->m_flags |= XFS_MOUNT_DAX; - break; -#endif - case Opt_barrier: - xfs_warn(mp, "%s option is deprecated, ignoring.", p); - mp->m_flags |= XFS_MOUNT_BARRIER; - break; - case Opt_nobarrier: - xfs_warn(mp, "%s option is deprecated, ignoring.", p); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - break; - default: - xfs_warn(mp, "unknown mount option [%s].", p); - return -EINVAL; - } - } +#define fsparam_dead(NAME) \ + __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) +static const struct fs_parameter_spec xfs_fs_parameters[] = { /* - * no recovery flag requires a read-only mount + * These mount options were supposed to be deprecated in September 2025 + * but the deprecation warning was buggy, so not all users were + * notified. The deprecation is now obnoxiously loud and postponed to + * September 2030. */ - if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_warn(mp, "no-recovery mounts must be read-only."); - return -EINVAL; - } - - if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { - xfs_warn(mp, - "sunit and swidth options incompatible with the noalign option"); - return -EINVAL; - } - -#ifndef CONFIG_XFS_QUOTA - if (XFS_IS_QUOTA_RUNNING(mp)) { - xfs_warn(mp, "quota support not available in this kernel."); - return -EINVAL; - } -#endif - - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { - xfs_warn(mp, "sunit and swidth must be specified together"); - return -EINVAL; - } - - if (dsunit && (dswidth % dsunit != 0)) { - xfs_warn(mp, - "stripe width (%d) must be a multiple of the stripe unit (%d)", - dswidth, dsunit); - return -EINVAL; - } - -done: - if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. 
- */ - mp->m_dalign = dsunit; - mp->m_swidth = dswidth; - } - - if (mp->m_logbufs != -1 && - mp->m_logbufs != 0 && - (mp->m_logbufs < XLOG_MIN_ICLOGS || - mp->m_logbufs > XLOG_MAX_ICLOGS)) { - xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", - mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return -EINVAL; - } - if (mp->m_logbsize != -1 && - mp->m_logbsize != 0 && - (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || - mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || - !is_power_of_2(mp->m_logbsize))) { - xfs_warn(mp, - "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - mp->m_logbsize); - return -EINVAL; - } - - if (iosizelog) { - if (iosizelog > XFS_MAX_IO_LOG || - iosizelog < XFS_MIN_IO_LOG) { - xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", - iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return -EINVAL; - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = iosizelog; - mp->m_writeio_log = iosizelog; - } - - return 0; -} + fsparam_dead("attr2"), + fsparam_dead("noattr2"), + fsparam_dead("ikeep"), + fsparam_dead("noikeep"), + + fsparam_u32("logbufs", Opt_logbufs), + fsparam_string("logbsize", Opt_logbsize), + fsparam_string("logdev", Opt_logdev), + fsparam_string("rtdev", Opt_rtdev), + fsparam_flag("wsync", Opt_wsync), + fsparam_flag("noalign", Opt_noalign), + fsparam_flag("swalloc", Opt_swalloc), + fsparam_u32("sunit", Opt_sunit), + fsparam_u32("swidth", Opt_swidth), + fsparam_flag("nouuid", Opt_nouuid), + fsparam_flag("grpid", Opt_grpid), + fsparam_flag("nogrpid", Opt_nogrpid), + fsparam_flag("bsdgroups", Opt_bsdgroups), + fsparam_flag("sysvgroups", Opt_sysvgroups), + fsparam_string("allocsize", Opt_allocsize), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag("inode64", Opt_inode64), + fsparam_flag("inode32", Opt_inode32), + fsparam_flag("largeio", Opt_largeio), + fsparam_flag("nolargeio", Opt_nolargeio), + fsparam_flag("filestreams", Opt_filestreams), + fsparam_flag("quota", Opt_quota), + fsparam_flag("noquota", Opt_noquota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + fsparam_flag("prjquota", Opt_prjquota), + fsparam_flag("uquota", Opt_uquota), + fsparam_flag("gquota", Opt_gquota), + fsparam_flag("pquota", Opt_pquota), + fsparam_flag("uqnoenforce", Opt_uqnoenforce), + fsparam_flag("gqnoenforce", Opt_gqnoenforce), + fsparam_flag("pqnoenforce", Opt_pqnoenforce), + fsparam_flag("qnoenforce", Opt_qnoenforce), + fsparam_flag("discard", Opt_discard), + fsparam_flag("nodiscard", Opt_nodiscard), + fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), + fsparam_string("max_atomic_write", Opt_max_atomic_write), + {} +}; struct proc_xfs_info { uint64_t flag; char *str; }; -STATIC int -xfs_showargs( - struct xfs_mount *mp, - struct seq_file *m) +static int +xfs_fs_show_options( + struct seq_file *m, + struct dentry *root) { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_IKEEP, ",ikeep" }, - { XFS_MOUNT_WSYNC, ",wsync" }, - { XFS_MOUNT_NOALIGN, ",noalign" }, - { XFS_MOUNT_SWALLOC, ",swalloc" }, - { XFS_MOUNT_NOUUID, ",nouuid" }, - { XFS_MOUNT_NORECOVERY, ",norecovery" }, - { XFS_MOUNT_ATTR2, ",attr2" }, - { XFS_MOUNT_FILESTREAMS, ",filestreams" }, - { XFS_MOUNT_GRPID, ",grpid" }, - { XFS_MOUNT_DISCARD, ",discard" }, - { XFS_MOUNT_SMALL_INUMS, ",inode32" }, - { XFS_MOUNT_DAX, ",dax" }, 
- { 0, NULL } - }; - static struct proc_xfs_info xfs_info_unset[] = { - /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" }, - { XFS_MOUNT_BARRIER, ",nobarrier" }, - { XFS_MOUNT_SMALL_INUMS, ",inode64" }, + { XFS_FEAT_WSYNC, ",wsync" }, + { XFS_FEAT_NOALIGN, ",noalign" }, + { XFS_FEAT_SWALLOC, ",swalloc" }, + { XFS_FEAT_NOUUID, ",nouuid" }, + { XFS_FEAT_NORECOVERY, ",norecovery" }, + { XFS_FEAT_FILESTREAMS, ",filestreams" }, + { XFS_FEAT_GRPID, ",grpid" }, + { XFS_FEAT_DISCARD, ",discard" }, + { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, + { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, + { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; + struct xfs_mount *mp = XFS_M(root->d_sb); struct proc_xfs_info *xfs_infop; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { - if (mp->m_flags & xfs_infop->flag) - seq_puts(m, xfs_infop->str); - } - for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { - if (!(mp->m_flags & xfs_infop->flag)) + if (mp->m_features & xfs_infop->flag) seq_puts(m, xfs_infop->str); } - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 32 : 64); + + if (xfs_has_allocsize(mp)) seq_printf(m, ",allocsize=%dk", - (int)(1 << mp->m_writeio_log) >> 10); + (1 << mp->m_allocsize_log) >> 10); if (mp->m_logbufs > 0) seq_printf(m, ",logbufs=%d", mp->m_logbufs); @@ -533,71 +230,67 @@ xfs_showargs( seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + if (mp->m_qflags & XFS_UQUOTA_ENFD) seq_puts(m, ",usrquota"); else if (mp->m_qflags & XFS_UQUOTA_ACCT) seq_puts(m, ",uqnoenforce"); - if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_PQUOTA_ENFD) - seq_puts(m, ",prjquota"); - else - seq_puts(m, ",pqnoenforce"); - } - if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_GQUOTA_ENFD) - seq_puts(m, ",grpquota"); - else - seq_puts(m, ",gqnoenforce"); - } + if (mp->m_qflags & XFS_PQUOTA_ENFD) + seq_puts(m, ",prjquota"); + else if (mp->m_qflags & XFS_PQUOTA_ACCT) + seq_puts(m, ",pqnoenforce"); + + if (mp->m_qflags & XFS_GQUOTA_ENFD) + seq_puts(m, ",grpquota"); + else if (mp->m_qflags & XFS_GQUOTA_ACCT) + seq_puts(m, ",gqnoenforce"); if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + if (mp->m_awu_max_bytes) + seq_printf(m, ",max_atomic_write=%lluk", + mp->m_awu_max_bytes >> 10); + return 0; } -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) + +static bool +xfs_set_inode_alloc_perag( + struct xfs_perag *pag, + xfs_ino_t ino, + xfs_agnumber_t max_metadata) { - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - * Note1: get_block_t takes a long (implicit cast from above) - * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch - * can optionally convert the [unsigned] long from above into - * an [unsigned] long long. 
- */ + if (!xfs_is_inode32(pag_mount(pag))) { + set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return false; + } -#if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -# else - pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift); -# endif -#endif + if (ino > XFS_MAXINUMBER_32) { + clear_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return false; + } - return (((uint64_t)pagefactor) << bitshift) - 1; + set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + if (pag_agno(pag) < max_metadata) + set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + else + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return true; } /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically - * whether or not XFS_MOUNT_SMALL_INUMS is set. + * whether or not XFS_FEAT_SMALL_INUMS is set. * * Inode allocation patterns are altered only if inode32 is requested - * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large. - * If altered, XFS_MOUNT_32BITINODES is set as well. + * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large. + * If altered, XFS_OPSTATE_INODE32 is set as well. * * An agcount independent of that in the mount structure is provided * because in the growfs case, mp->m_sb.sb_agcount is not yet updated @@ -621,7 +314,7 @@ xfs_set_inode_alloc( * Calculate how much should be reserved for inodes to meet * the max inode percentage. Used only for inode32. */ - if (mp->m_maxicount) { + if (M_IGEO(mp)->maxicount) { uint64_t icount; icount = sbp->sb_dblocks * sbp->sb_imax_pct; @@ -634,18 +327,18 @@ xfs_set_inode_alloc( } /* Get the last possible inode in the filesystem */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); /* * If user asked for no more than 32-bit inodes, and the fs is - * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter + * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter * the allocator to accommodate the request. */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) - mp->m_flags |= XFS_MOUNT_32BITINODES; + if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32) + xfs_set_inode32(mp); else - mp->m_flags &= ~XFS_MOUNT_32BITINODES; + xfs_clear_inode32(mp); for (index = 0; index < agcount; index++) { struct xfs_perag *pag; @@ -653,42 +346,60 @@ xfs_set_inode_alloc( ino = XFS_AGINO_TO_INO(mp, index, agino); pag = xfs_perag_get(mp, index); + if (xfs_set_inode_alloc_perag(pag, ino, max_metadata)) + maxagi++; + xfs_perag_put(pag); + } - if (mp->m_flags & XFS_MOUNT_32BITINODES) { - if (ino > XFS_MAXINUMBER_32) { - pag->pagi_inodeok = 0; - pag->pagf_metadata = 0; - } else { - pag->pagi_inodeok = 1; - maxagi++; - if (index < max_metadata) - pag->pagf_metadata = 1; - else - pag->pagf_metadata = 0; - } - } else { - pag->pagi_inodeok = 1; - pag->pagf_metadata = 0; - } + return xfs_is_inode32(mp) ? maxagi : agcount; +} - xfs_perag_put(pag); +static int +xfs_setup_dax_always( + struct xfs_mount *mp) +{ + if (!mp->m_ddev_targp->bt_daxdev && + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) { + xfs_alert(mp, + "DAX unsupported by block device. 
Turning off DAX."); + goto disable_dax; + } + + if (mp->m_super->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "DAX not supported for blocksize. Turning off DAX."); + goto disable_dax; } - return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount; + if (xfs_has_reflink(mp) && + bdev_is_partition(mp->m_ddev_targp->bt_bdev)) { + xfs_alert(mp, + "DAX and reflink cannot work with multi-partitions!"); + return -EINVAL; + } + + return 0; + +disable_dax: + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); + return 0; } STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct block_device **bdevp) + struct file **bdev_filep) { int error = 0; - - *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - mp); - if (IS_ERR(*bdevp)) { - error = PTR_ERR(*bdevp); + blk_mode_t mode; + + mode = sb_open_mode(mp->m_super->s_flags); + *bdev_filep = bdev_file_open_by_path(name, mode, + mp->m_super, &fs_holder_ops); + if (IS_ERR(*bdev_filep)) { + error = PTR_ERR(*bdev_filep); + *bdev_filep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -696,35 +407,45 @@ xfs_blkdev_get( } STATIC void -xfs_blkdev_put( - struct block_device *bdev) -{ - if (bdev) - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -void -xfs_blkdev_issue_flush( - xfs_buftarg_t *buftarg) -{ - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); -} - -STATIC void -xfs_close_devices( +xfs_shutdown_devices( struct xfs_mount *mp) { + /* + * Udev is triggered whenever anyone closes a block device or unmounts + * a file systemm on a block device. + * The default udev rules invoke blkid to read the fs super and create + * symlinks to the bdev under /dev/disk. For this, it uses buffered + * reads through the page cache. + * + * xfs_db also uses buffered reads to examine metadata. There is no + * coordination between xfs_db and udev, which means that they can run + * concurrently. Note there is no coordination between the kernel and + * blkid either. + * + * On a system with 64k pages, the page cache can cache the superblock + * and the root inode (and hence the root directory) with the same 64k + * page. If udev spawns blkid after the mkfs and the system is busy + * enough that it is still running when xfs_db starts up, they'll both + * read from the same page in the pagecache. + * + * The unmount writes updated inode metadata to disk directly. The XFS + * buffer cache does not use the bdev pagecache, so it needs to + * invalidate that pagecache on unmount. If the above scenario occurs, + * the pagecache no longer reflects what's on disk, xfs_db reads the + * stale metadata, and fails to find /a. Most of the time this succeeds + * because closing a bdev invalidates the page cache, but when processes + * race, everyone loses. 
+ */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { - struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - xfs_free_buftarg(mp, mp->m_logdev_targp); - xfs_blkdev_put(logdev); + blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); + invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { - struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - xfs_free_buftarg(mp, mp->m_rtdev_targp); - xfs_blkdev_put(rtdev); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } - xfs_free_buftarg(mp, mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* @@ -741,25 +462,28 @@ STATIC int xfs_open_devices( struct xfs_mount *mp) { - struct block_device *ddev = mp->m_super->s_bdev; - struct block_device *logdev = NULL, *rtdev = NULL; + struct super_block *sb = mp->m_super; + struct block_device *ddev = sb->s_bdev; + struct file *logdev_file = NULL, *rtdev_file = NULL; int error; /* * Open real time and log devices - order is important. */ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file); if (error) - goto out; + return error; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file); if (error) goto out_close_logdev; - if (rtdev == ddev || rtdev == logdev) { + if (file_bdev(rtdev_file) == ddev || + (logdev_file && + file_bdev(rtdev_file) == file_bdev(logdev_file))) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -770,38 +494,49 @@ xfs_open_devices( /* * Setup xfs_mount buffer target pointers */ - error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); - if (!mp->m_ddev_targp) + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); + if (IS_ERR(mp->m_ddev_targp)) { + error = PTR_ERR(mp->m_ddev_targp); + mp->m_ddev_targp = NULL; goto out_close_rtdev; + } - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); - if (!mp->m_rtdev_targp) + if (rtdev_file) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); + if (IS_ERR(mp->m_rtdev_targp)) { + error = PTR_ERR(mp->m_rtdev_targp); + mp->m_rtdev_targp = NULL; goto out_free_ddev_targ; + } } - if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); - if (!mp->m_logdev_targp) + if (logdev_file && file_bdev(logdev_file) != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); + if (IS_ERR(mp->m_logdev_targp)) { + error = PTR_ERR(mp->m_logdev_targp); + mp->m_logdev_targp = NULL; goto out_free_rtdev_targ; + } } else { mp->m_logdev_targp = mp->m_ddev_targp; + /* Handle won't be used, drop it */ + if (logdev_file) + bdev_fput(logdev_file); } return 0; out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_free_buftarg(mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(rtdev); + if (rtdev_file) + bdev_fput(rtdev_file); out_close_logdev: - if (logdev && logdev != ddev) - xfs_blkdev_put(logdev); - out: + if (logdev_file) + bdev_fput(logdev_file); return error; } @@ -814,23 +549,32 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize, + mp->m_sb.sb_dblocks); if (error) return error; if 
(mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { unsigned int log_sector_size = BBSIZE; - if (xfs_sb_version_hassector(&mp->m_sb)) + if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, - log_sector_size); + error = xfs_configure_buftarg(mp->m_logdev_targp, + log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } - if (mp->m_rtdev_targp) { - error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_sectsize); + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same time"); + return -EINVAL; + } + mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { + error = xfs_configure_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } @@ -843,60 +587,51 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), + 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; - mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); - if (!mp->m_data_workqueue) - goto out_destroy_buf; - mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), + 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) - goto out_destroy_data_iodone_queue; - - mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); - if (!mp->m_cil_workqueue) - goto out_destroy_unwritten; + goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), + 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) - goto out_destroy_cil; + goto out_destroy_unwritten; - mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, - mp->m_fsname); - if (!mp->m_log_workqueue) + mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s", + XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); + if (!mp->m_blockgc_wq) goto out_destroy_reclaim; - mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); - if (!mp->m_eofblocks_workqueue) - goto out_destroy_log; + mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), + 1, mp->m_super->s_id); + if (!mp->m_inodegc_wq) + goto out_destroy_blockgc; - mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, - mp->m_fsname); + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, + mp->m_super->s_id); if (!mp->m_sync_workqueue) - goto out_destroy_eofb; + goto out_destroy_inodegc; return 0; -out_destroy_eofb: - destroy_workqueue(mp->m_eofblocks_workqueue); -out_destroy_log: - destroy_workqueue(mp->m_log_workqueue); +out_destroy_inodegc: + destroy_workqueue(mp->m_inodegc_wq); +out_destroy_blockgc: + destroy_workqueue(mp->m_blockgc_wq); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); -out_destroy_cil: - destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); -out_destroy_data_iodone_queue: - destroy_workqueue(mp->m_data_workqueue); 
out_destroy_buf: destroy_workqueue(mp->m_buf_workqueue); out: @@ -908,15 +643,27 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_eofblocks_workqueue); - destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_blockgc_wq); + destroy_workqueue(mp->m_inodegc_wq); destroy_workqueue(mp->m_reclaim_workqueue); - destroy_workqueue(mp->m_cil_workqueue); - destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); } +static void +xfs_flush_inodes_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, struct xfs_mount, + m_flush_inodes_work); + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting @@ -927,12 +674,15 @@ void xfs_flush_inodes( struct xfs_mount *mp) { - struct super_block *sb = mp->m_super; + /* + * If flush_work() returns true then that means we waited for a flush + * which was already in progress. Don't bother running another scan. + */ + if (flush_work(&mp->m_flush_inodes_work)) + return; - if (down_read_trylock(&sb->s_umount)) { - sync_inodes_sb(sb); - up_read(&sb->s_umount); - } + queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); + flush_work(&mp->m_flush_inodes_work); } /* Catch misguided souls that try to use this interface on XFS */ @@ -953,41 +703,41 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); - int error; trace_xfs_destroy_inode(ip); ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); + xfs_inode_mark_reclaimable(ip); +} - if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); - if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) - xfs_warn(ip->i_mount, -"Error %d while evicting CoW blocks for inode %llu.", - error, ip->i_ino); - } - - xfs_inactive(ip); - - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - XFS_STATS_INC(ip->i_mount, vn_reclaim); +static void +xfs_fs_dirty_inode( + struct inode *inode, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; - /* - * We should never get here with one of the reclaim flags already set. - */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) + return; /* - * We always use background reclaim here because even if the - * inode is clean, it still may be under IO and hence we have - * to take the flush lock. The background reclaim path handles - * this more efficiently than we can here, so simply let background - * reclaim tear down all inodes. + * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC) + * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed + * in flags possibly together with I_DIRTY_SYNC. 
*/ - xfs_inode_set_reclaim_tag(ip); + if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME)) + return; + + if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp)) + return; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + xfs_trans_commit(tp); } /* @@ -1012,11 +762,7 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); - mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); + init_rwsem(&ip->i_lock); } /* @@ -1038,20 +784,45 @@ xfs_fs_drop_inode( * that. See the comment for this inode flag. */ if (ip->i_flags & XFS_IRECOVERY) { - ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); + ASSERT(xlog_recovery_needed(ip->i_mount->m_log)); return 0; } - return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); + return inode_generic_drop(inode); } STATIC void -xfs_free_fsname( +xfs_fs_evict_inode( + struct inode *inode) +{ + if (IS_DAX(inode)) + dax_break_layout_final(inode); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (IS_ENABLED(CONFIG_XFS_RT) && + S_ISREG(inode->i_mode) && inode->i_private) { + xfs_open_zone_put(inode->i_private); + inode->i_private = NULL; + } +} + +static void +xfs_mount_free( struct xfs_mount *mp) { - kfree(mp->m_fsname); + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + if (mp->m_ddev_targp) + xfs_free_buftarg(mp->m_ddev_targp); + + debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); + kfree(mp); } STATIC int @@ -1060,6 +831,9 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); + int error; + + trace_xfs_fs_sync_fs(mp, __return_address); /* * Doing anything during the async pass would be counterproductive. @@ -1067,7 +841,10 @@ xfs_fs_sync_fs( if (!wait) return 0; - xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + if (laptop_mode) { /* * The disk must be active because we're syncing. @@ -1077,308 +854,161 @@ xfs_fs_sync_fs( flush_delayed_work(&mp->m_log->l_work); } - return 0; -} - -STATIC int -xfs_fs_statfs( - struct dentry *dentry, - struct kstatfs *statp) -{ - struct xfs_mount *mp = XFS_M(dentry->d_sb); - xfs_sb_t *sbp = &mp->m_sb; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); - uint64_t fakeinos, id; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - xfs_extlen_t lsize; - int64_t ffree; - - statp->f_type = XFS_SB_MAGIC; - statp->f_namelen = MAXNAMELEN - 1; - - id = huge_encode_dev(mp->m_ddev_targp->bt_dev); - statp->f_fsid.val[0] = (u32)id; - statp->f_fsid.val[1] = (u32)(id >> 32); - - icount = percpu_counter_sum(&mp->m_icount); - ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - - spin_lock(&mp->m_sb_lock); - statp->f_bsize = sbp->sb_blocksize; - lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; - statp->f_blocks = sbp->sb_dblocks - lsize; - spin_unlock(&mp->m_sb_lock); - - statp->f_bfree = fdblocks - mp->m_alloc_set_aside; - statp->f_bavail = statp->f_bfree; - - fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = MIN(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); - if (mp->m_maxicount) - statp->f_files = min_t(typeof(statp->f_files), - statp->f_files, - mp->m_maxicount); - - /* If sb_icount overshot maxicount, report actual allocation */ - statp->f_files = max_t(typeof(statp->f_files), - statp->f_files, - sbp->sb_icount); - - /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (icount - ifree); - statp->f_ffree = max_t(int64_t, ffree, 0); - + /* + * If we are called with page faults frozen out, it means we are about + * to freeze the transaction subsystem. Take the opportunity to shut + * down inodegc because once SB_FREEZE_FS is set it's too late to + * prevent inactivation races with freeze. The fs doesn't get called + * again by the freezing process until after SB_FREEZE_FS has been set, + * so it's now or never. Same logic applies to speculative allocation + * garbage collection. + * + * We don't care if this is a normal syncfs call that does this or + * freeze that does this - we can run this multiple times without issue + * and we won't race with a restart because a restart can only occur + * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE. + */ + if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { + xfs_inodegc_stop(mp); + xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); + } - if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == - (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) - xfs_qm_statvfs(ip, statp); return 0; } -STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +static xfs_extlen_t +xfs_internal_log_size( + struct xfs_mount *mp) { - uint64_t resblks = 0; - - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + if (!mp->m_sb.sb_logstart) + return 0; + return mp->m_sb.sb_logblocks; } -STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +static void +xfs_statfs_data( + struct xfs_mount *mp, + struct kstatfs *st) { - uint64_t resblks; + int64_t fdblocks = + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); + /* make sure st->f_bfree does not underflow */ + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); - xfs_reserve_blocks(mp, &resblks, NULL); + /* + * sb_dblocks can change during growfs, but nothing cares about reporting + * the old or new value during growfs. + */ + st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp); } /* - * Trigger writeback of all the dirty metadata in the file system. - * - * This ensures that the metadata is written to their location on disk rather - * than just existing in transactions in the log. This means after a quiesce - * there is no log replay required to write the inodes to disk - this is the - * primary difference between a sync and a quiesce. - * - * Note: xfs_log_quiesce() stops background log work - the callers must ensure - * it is started again when appropriate. + * When stat(v)fs is called on a file with the realtime bit set or a directory + * with the rtinherit bit, report freespace information for the RT device + * instead of the main data device. 
*/ -void -xfs_quiesce_attr( - struct xfs_mount *mp) +static void +xfs_statfs_rt( + struct xfs_mount *mp, + struct kstatfs *st) { - int error = 0; - - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* force the log to unpin objects from the now complete transactions */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - - xfs_log_quiesce(mp); + st->f_bfree = xfs_rtbxlen_to_blen(mp, + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, + mp->m_free[XC_FREE_RTEXTENTS].res_total); } -STATIC int -xfs_test_remount_options( - struct super_block *sb, +static void +xfs_statfs_inodes( struct xfs_mount *mp, - char *options) + struct kstatfs *st) { - int error = 0; - struct xfs_mount *tmp_mp; + uint64_t icount = percpu_counter_sum(&mp->m_icount); + uint64_t ifree = percpu_counter_sum(&mp->m_ifree); + uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree); - tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL); - if (!tmp_mp) - return -ENOMEM; + st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); + if (M_IGEO(mp)->maxicount) + st->f_files = min_t(typeof(st->f_files), st->f_files, + M_IGEO(mp)->maxicount); - tmp_mp->m_super = sb; - error = xfs_parseargs(tmp_mp, options); - xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + /* If sb_icount overshot maxicount, report actual allocation */ + st->f_files = max_t(typeof(st->f_files), st->f_files, + mp->m_sb.sb_icount); - return error; + /* Make sure st->f_ffree does not underflow */ + st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree)); } STATIC int -xfs_fs_remount( - struct super_block *sb, - int *flags, - char *options) +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *st) { - struct xfs_mount *mp = XFS_M(sb); - xfs_sb_t *sbp = &mp->m_sb; - substring_t args[MAX_OPT_ARGS]; - char *p; - int error; - - /* First, check for complete junk; i.e. invalid options */ - error = xfs_test_remount_options(sb, mp, options); - if (error) - return error; + struct xfs_mount *mp = XFS_M(dentry->d_sb); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); - sync_filesystem(sb); - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_barrier: - xfs_warn(mp, "%s option is deprecated, ignoring.", p); - mp->m_flags |= XFS_MOUNT_BARRIER; - break; - case Opt_nobarrier: - xfs_warn(mp, "%s option is deprecated, ignoring.", p); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - break; - case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); - break; - case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); - break; - default: - /* - * Logically we would return an error here to prevent - * users from believing they might have changed - * mount options using remount which can't be changed. 
- * - * But unfortunately mount(8) adds all options from - * mtab and fstab to the mount arguments in some cases - * so we can't blindly reject options, but have to - * check for each specified option if it actually - * differs from the currently set option and only - * reject it if that's the case. - * - * Until that is implemented we return success for - * every remount request, and silently ignore all - * options that we can't actually change. - */ -#if 0 - xfs_info(mp, - "mount option \"%s\" not supported for remount", p); - return -EINVAL; -#else - break; -#endif - } - } + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); - /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { - xfs_warn(mp, - "ro->rw transition prohibited on norecovery mount"); - return -EINVAL; - } + st->f_type = XFS_SUPER_MAGIC; + st->f_namelen = MAXNAMELEN - 1; + st->f_bsize = mp->m_sb.sb_blocksize; + st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev)); - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_ro_compat_feature(sbp, - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", - (sbp->sb_features_ro_compat & - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - return -EINVAL; - } + xfs_statfs_data(mp, st); + xfs_statfs_inodes(mp, st); - mp->m_flags &= ~XFS_MOUNT_RDONLY; + if (XFS_IS_REALTIME_MOUNT(mp) && + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) + xfs_statfs_rt(mp, st); - /* - * If this is the first remount to writeable state we - * might have some superblock changes to update. - */ - if (mp->m_update_sb) { - error = xfs_sync_sb(mp, false); - if (error) { - xfs_warn(mp, "failed to write sb changes"); - return error; - } - mp->m_update_sb = false; - } + if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) + xfs_qm_statvfs(ip, st); - /* - * Fill out the reserve pool if it is empty. Use the stashed - * value if it is non-zero, otherwise go with the default. - */ - xfs_restore_resvblks(mp); - xfs_log_work_queue(mp); - xfs_queue_eofblocks(mp); + /* + * XFS does not distinguish between blocks available to privileged and + * unprivileged users. + */ + st->f_bavail = st->f_bfree; + return 0; +} - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } +STATIC void +xfs_save_resvblks( + struct xfs_mount *mp) +{ + enum xfs_free_counter i; - /* Create the per-AG metadata reservation pool .*/ - error = xfs_fs_reserve_ag_blocks(mp); - if (error && error != -ENOSPC) - return error; + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); } +} - /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { - /* Free the per-AG metadata reservation pool. */ - error = xfs_fs_unreserve_ag_blocks(mp); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - - /* - * Before we sync the metadata, we need to free up the reserve - * block pool so that the used block count in the superblock on - * disk is correct at the end of the remount. 
Stash the current - * reserve pool size so that if we get remounted rw, we can - * return it to the same size. - */ - xfs_save_resvblks(mp); - - /* - * Cancel background eofb scanning so it cannot race with the - * final log force+buftarg wait and deadlock the remount. - */ - cancel_delayed_work_sync(&mp->m_eofblocks_work); - - xfs_quiesce_attr(mp); - mp->m_flags |= XFS_MOUNT_RDONLY; +STATIC void +xfs_restore_resvblks( + struct xfs_mount *mp) +{ + uint64_t resblks; + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); } - - return 0; } /* @@ -1392,10 +1022,33 @@ xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); + unsigned int flags; + int ret; + /* + * The filesystem is now frozen far enough that memory reclaim + * cannot safely operate on the filesystem. Hence we need to + * set a GFP_NOFS context here to avoid recursion deadlocks. + */ + flags = memalloc_nofs_save(); xfs_save_resvblks(mp); - xfs_quiesce_attr(mp); - return xfs_sync_sb(mp, true); + ret = xfs_log_quiesce(mp); + memalloc_nofs_restore(flags); + + /* + * For read-write filesystems, we need to restart the inodegc on error + * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not + * going to be run to restart it now. We are at SB_FREEZE_FS level + * here, so we can restart safely without racing with a stop in + * xfs_fs_sync_fs(). + */ + if (ret && !xfs_is_readonly(mp)) { + xfs_blockgc_start(mp); + xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); + } + + return ret; } STATIC int @@ -1406,15 +1059,20 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - return 0; -} -STATIC int -xfs_fs_show_options( - struct seq_file *m, - struct dentry *root) -{ - return xfs_showargs(XFS_M(root->d_sb), m); + /* + * Don't reactivate the inodegc worker on a readonly filesystem because + * inodes are sent directly to reclaim. Don't reactivate the blockgc + * worker because there are no speculative preallocations on a readonly + * filesystem. + */ + if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); + xfs_blockgc_start(mp); + xfs_inodegc_start(mp); + } + + return 0; } /* @@ -1425,10 +1083,8 @@ STATIC int xfs_finish_flags( struct xfs_mount *mp) { - int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - /* Fail a mount where the logbuf is smaller than the log stripe */ - if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (xfs_has_logv2(mp)) { if (mp->m_logbsize <= 0 && mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { mp->m_logbsize = mp->m_sb.sb_logsunit; @@ -1448,40 +1104,35 @@ xfs_finish_flags( } /* - * V5 filesystems always use attr2 format for attributes. - */ - if (xfs_sb_version_hascrc(&mp->m_sb) && - (mp->m_flags & XFS_MOUNT_NOATTR2)) { - xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. 
" - "attr2 is always enabled for V5 filesystems."); - return -EINVAL; - } - - /* - * mkfs'ed attr2 will turn on attr2 mount unless explicitly - * told by noattr2 to turn it off - */ - if (xfs_sb_version_hasattr2(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_NOATTR2)) - mp->m_flags |= XFS_MOUNT_ATTR2; - - /* * prohibit r/w mounts of read-only filesystems */ - if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) { xfs_warn(mp, "cannot mount a read-only filesystem as read-write"); return -EROFS; } - if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && - (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && - !xfs_sb_version_has_pquotino(&mp->m_sb)) { + if ((mp->m_qflags & XFS_GQUOTA_ACCT) && + (mp->m_qflags & XFS_PQUOTA_ACCT) && + !xfs_has_pquotino(mp)) { xfs_warn(mp, "Super block does not support project and group quota together"); return -EINVAL; } + if (!xfs_has_zoned(mp)) { + if (mp->m_max_open_zones) { + xfs_warn(mp, +"max_open_zones mount option only supported on zoned file systems."); + return -EINVAL; + } + if (mp->m_features & XFS_FEAT_NOLIFETIME) { + xfs_warn(mp, +"nolifetime mount option only supported on zoned file systems."); + return -EINVAL; + } + } + return 0; } @@ -1489,7 +1140,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1499,12 +1151,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) goto free_ifree; + error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc; + + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } + return 0; +free_freecounters: + while (--i >= 0) + percpu_counter_destroy(&mp->m_free[i].count); + percpu_counter_destroy(&mp->m_delalloc_rtextents); +free_delalloc: + percpu_counter_destroy(&mp->m_delalloc_blks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1518,48 +1187,516 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + if (!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); + ASSERT(xfs_is_shutdown(mp) || + percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); + percpu_counter_destroy(&mp->m_delalloc_rtextents); + ASSERT(xfs_is_shutdown(mp) || + percpu_counter_sum(&mp->m_delalloc_blks) == 0); + percpu_counter_destroy(&mp->m_delalloc_blks); } -STATIC int -xfs_fs_fill_super( +static int +xfs_inodegc_init_percpu( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + mp->m_inodegc = alloc_percpu(struct xfs_inodegc); + if (!mp->m_inodegc) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + gc->cpu = cpu; + gc->mp = mp; + 
init_llist_head(&gc->list); + gc->items = 0; + gc->error = 0; + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; +} + +static void +xfs_inodegc_free_percpu( + struct xfs_mount *mp) +{ + if (!mp->m_inodegc) + return; + free_percpu(mp->m_inodegc); +} + +static void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_rtmount_freesb(mp); + xfs_freesb(mp); + xchk_mount_stats_free(mp); + free_percpu(mp->m_stats.xs_stats); + xfs_inodegc_free_percpu(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_shutdown_devices(mp); +} + +static long +xfs_fs_nr_cached_objects( struct super_block *sb, - void *data, - int silent) + struct shrink_control *sc) { - struct inode *root; - struct xfs_mount *mp = NULL; - int flags = 0, error = -ENOMEM; + /* Paranoia: catch incorrect calls during mount setup or teardown */ + if (WARN_ON_ONCE(!sb->s_fs_info)) + return 0; + return xfs_reclaim_inodes_count(XFS_M(sb)); +} - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); - if (!mp) - goto out; +static long +xfs_fs_free_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); +} - spin_lock_init(&mp->m_sb_lock); - mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); - INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); - mp->m_kobj.kobject.kset = xfs_kset; +static void +xfs_fs_shutdown( + struct super_block *sb) +{ + xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); +} + +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, + .drop_inode = xfs_fs_drop_inode, + .evict_inode = xfs_fs_evict_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, + .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, +}; + +static int +suffix_kstrtoint( + const char *s, + unsigned int base, + int *res) +{ + int last, shift_left_factor = 0, _res; + char *value; + int ret = 0; + + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoint(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + +static int +suffix_kstrtoull( + const char *s, + unsigned int base, + unsigned long long *res) +{ + int last, shift_left_factor = 0; + unsigned long long _res; + char *value; + int ret = 0; + + value = kstrdup(s, 
GFP_KERNEL);
+	if (!value)
+		return -ENOMEM;
+
+	last = strlen(value) - 1;
+	if (value[last] == 'K' || value[last] == 'k') {
+		shift_left_factor = 10;
+		value[last] = '\0';
+	}
+	if (value[last] == 'M' || value[last] == 'm') {
+		shift_left_factor = 20;
+		value[last] = '\0';
+	}
+	if (value[last] == 'G' || value[last] == 'g') {
+		shift_left_factor = 30;
+		value[last] = '\0';
+	}
+
+	if (kstrtoull(value, base, &_res))
+		ret = -EINVAL;
+	kfree(value);
+	*res = _res << shift_left_factor;
+	return ret;
+}
+
+static inline void
+xfs_fs_warn_deprecated(
+	struct fs_context	*fc,
+	struct fs_parameter	*param)
+{
+	/*
+	 * Always warn about someone passing in a deprecated mount option.
+	 * Previously we wouldn't print the warning if we were reconfiguring
+	 * and current mount point already had the flag set, but that was not
+	 * the right thing to do.
+	 *
+	 * Many distributions mount the root filesystem with no options in the
+	 * initramfs and rely on mount -a to remount the root fs with the
+	 * options in fstab. However, the old behavior meant that there would
+	 * never be a warning about deprecated mount options for the root fs in
+	 * /etc/fstab. On a single-fs system, that means no warning at all.
+	 *
+	 * Compounding this problem are distribution scripts that copy
+	 * /proc/mounts to fstab, which means that we can't remove mount
+	 * options unless we're 100% sure they have only ever been advertised
+	 * in /proc/mounts in response to explicitly provided mount options.
+	 */
+	xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
+}
+
+/*
+ * Set mount state from a mount option.
+ *
+ * NOTE: mp->m_super is NULL here!
+ */
+static int
+xfs_fs_parse_param(
+	struct fs_context	*fc,
+	struct fs_parameter	*param)
+{
+	struct xfs_mount	*parsing_mp = fc->s_fs_info;
+	struct fs_parse_result	result;
+	int			size = 0;
+	int			opt;
+
+	BUILD_BUG_ON(XFS_QFLAGS_MNTOPTS & XFS_MOUNT_QUOTA_ALL);
+
+	opt = fs_parse(fc, xfs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_deprecated:
+		xfs_fs_warn_deprecated(fc, param);
+		return 0;
+	case Opt_logbufs:
+		parsing_mp->m_logbufs = result.uint_32;
+		return 0;
+	case Opt_logbsize:
+		if (suffix_kstrtoint(param->string, 10, &parsing_mp->m_logbsize))
+			return -EINVAL;
+		return 0;
+	case Opt_logdev:
+		kfree(parsing_mp->m_logname);
+		parsing_mp->m_logname = kstrdup(param->string, GFP_KERNEL);
+		if (!parsing_mp->m_logname)
+			return -ENOMEM;
+		return 0;
+	case Opt_rtdev:
+		kfree(parsing_mp->m_rtname);
+		parsing_mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
+		if (!parsing_mp->m_rtname)
+			return -ENOMEM;
+		return 0;
+	case Opt_allocsize:
+		if (suffix_kstrtoint(param->string, 10, &size))
+			return -EINVAL;
+		parsing_mp->m_allocsize_log = ffs(size) - 1;
+		parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE;
+		return 0;
+	case Opt_grpid:
+	case Opt_bsdgroups:
+		parsing_mp->m_features |= XFS_FEAT_GRPID;
+		return 0;
+	case Opt_nogrpid:
+	case Opt_sysvgroups:
+		parsing_mp->m_features &= ~XFS_FEAT_GRPID;
+		return 0;
+	case Opt_wsync:
+		parsing_mp->m_features |= XFS_FEAT_WSYNC;
+		return 0;
+	case Opt_norecovery:
+		parsing_mp->m_features |= XFS_FEAT_NORECOVERY;
+		return 0;
+	case Opt_noalign:
+		parsing_mp->m_features |= XFS_FEAT_NOALIGN;
+		return 0;
+	case Opt_swalloc:
+		parsing_mp->m_features |= XFS_FEAT_SWALLOC;
+		return 0;
+	case Opt_sunit:
+		parsing_mp->m_dalign = result.uint_32;
+		return 0;
+	case Opt_swidth:
+		parsing_mp->m_swidth = result.uint_32;
+		return 0;
+	case Opt_inode32:
+		parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS;
+
return 0; + case Opt_inode64: + parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS; + return 0; + case Opt_nouuid: + parsing_mp->m_features |= XFS_FEAT_NOUUID; + return 0; + case Opt_largeio: + parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE; + return 0; + case Opt_nolargeio: + parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE; + return 0; + case Opt_filestreams: + parsing_mp->m_features |= XFS_FEAT_FILESTREAMS; + return 0; + case Opt_noquota: + parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_quota: + case Opt_uquota: + case Opt_usrquota: + parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_qnoenforce: + case Opt_uqnoenforce: + parsing_mp->m_qflags |= XFS_UQUOTA_ACCT; + parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_pquota: + case Opt_prjquota: + parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_pqnoenforce: + parsing_mp->m_qflags |= XFS_PQUOTA_ACCT; + parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_gquota: + case Opt_grpquota: + parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_gqnoenforce: + parsing_mp->m_qflags |= XFS_GQUOTA_ACCT; + parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; + return 0; + case Opt_discard: + parsing_mp->m_features |= XFS_FEAT_DISCARD; + return 0; + case Opt_nodiscard: + parsing_mp->m_features &= ~XFS_FEAT_DISCARD; + return 0; +#ifdef CONFIG_FS_DAX + case Opt_dax: + xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS); + return 0; + case Opt_dax_enum: + xfs_mount_set_dax_mode(parsing_mp, result.uint_32); + return 0; +#endif + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; + case Opt_max_atomic_write: + if (suffix_kstrtoull(param->string, 10, + &parsing_mp->m_awu_max_bytes)) { + xfs_warn(parsing_mp, + "max atomic write size must be positive integer"); + return -EINVAL; + } + return 0; + default: + xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); + return -EINVAL; + } + + return 0; +} + +static int +xfs_fs_validate_params( + struct xfs_mount *mp) +{ + /* No recovery flag requires a read-only mount */ + if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } + + if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return -EINVAL; + } + + if (!IS_ENABLED(CONFIG_XFS_QUOTA) && + (mp->m_qflags & ~XFS_QFLAGS_MNTOPTS)) { + xfs_warn(mp, "quota support not available in this kernel."); + return -EINVAL; + } + + if ((mp->m_dalign && !mp->m_swidth) || + (!mp->m_dalign && mp->m_swidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return -EINVAL; + } + + if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + mp->m_swidth, mp->m_dalign); + return -EINVAL; + } + + if (mp->m_logbufs != -1 && 
+ mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return -EINVAL; + } + + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return -EINVAL; + } + + if (xfs_has_allocsize(mp) && + (mp->m_allocsize_log > XFS_MAX_IO_LOG || + mp->m_allocsize_log < XFS_MIN_IO_LOG)) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG); + return -EINVAL; + } + + return 0; +} + +struct dentry * +xfs_debugfs_mkdir( + const char *name, + struct dentry *parent) +{ + struct dentry *child; + + /* Apparently we're expected to ignore error returns?? */ + child = debugfs_create_dir(name, parent); + if (IS_ERR(child)) + return NULL; + + return child; +} + +static int +xfs_fs_fill_super( + struct super_block *sb, + struct fs_context *fc) +{ + struct xfs_mount *mp = sb->s_fs_info; + struct inode *root; + int flags = 0, error; mp->m_super = sb; - sb->s_fs_info = mp; - error = xfs_parseargs(mp, (char *)data); + /* + * Copy VFS mount flags from the context now that all parameter parsing + * is guaranteed to have been completed by either the old mount API or + * the newer fsopen/fsconfig API. + */ + if (fc->sb_flags & SB_RDONLY) + xfs_set_readonly(mp); + if (fc->sb_flags & SB_DIRSYNC) + mp->m_features |= XFS_FEAT_DIRSYNC; + if (fc->sb_flags & SB_SYNCHRONOUS) + mp->m_features |= XFS_FEAT_WSYNC; + + error = xfs_fs_validate_params(mp); if (error) - goto out_free_fsname; + return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA @@ -1568,32 +1705,58 @@ xfs_fs_fill_super( #endif sb->s_op = &xfs_super_operations; - if (silent) + /* + * Delay mount work if the debug hook is set. 
This is debug
+	 * instrumentation to coordinate simulation of xfs mount failures with
+	 * VFS superblock operations.
+	 */
+	if (xfs_globals.mount_delay) {
+		xfs_notice(mp, "Delaying mount for %d seconds.",
+			xfs_globals.mount_delay);
+		msleep(xfs_globals.mount_delay * 1000);
+	}
+
+	if (fc->sb_flags & SB_SILENT)
 		flags |= XFS_MFSI_QUIET;
 
 	error = xfs_open_devices(mp);
 	if (error)
-		goto out_free_fsname;
+		return error;
+
+	if (xfs_debugfs) {
+		mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id,
+						  xfs_debugfs);
+	} else {
+		mp->m_debugfs = NULL;
+	}
 
 	error = xfs_init_mount_workqueues(mp);
 	if (error)
-		goto out_close_devices;
+		goto out_shutdown_devices;
 
 	error = xfs_init_percpu_counters(mp);
 	if (error)
 		goto out_destroy_workqueues;
 
+	error = xfs_inodegc_init_percpu(mp);
+	if (error)
+		goto out_destroy_counters;
+
 	/* Allocate stats memory before we do operations that might use it */
 	mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
 	if (!mp->m_stats.xs_stats) {
 		error = -ENOMEM;
-		goto out_destroy_counters;
+		goto out_destroy_inodegc;
 	}
 
-	error = xfs_readsb(mp, flags);
+	error = xchk_mount_stats_alloc(mp);
 	if (error)
 		goto out_free_stats;
 
+	error = xfs_readsb(mp, flags);
+	if (error)
+		goto out_free_scrub_stats;
+
 	error = xfs_finish_flags(mp);
 	if (error)
 		goto out_free_sb;
@@ -1602,55 +1765,197 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_sb;
 
-	error = xfs_filestream_mount(mp);
+	/*
+	 * V4 support is undergoing deprecation.
+	 *
+	 * Note: this has to use an open coded m_features check as xfs_has_crc
+	 * always returns false for !CONFIG_XFS_SUPPORT_V4.
+	 */
+	if (!(mp->m_features & XFS_FEAT_CRC)) {
+		if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) {
+			xfs_warn(mp,
+	"Deprecated V4 format (crc=0) not supported by kernel.");
+			error = -EINVAL;
+			goto out_free_sb;
+		}
+		xfs_warn_once(mp,
+	"Deprecated V4 format (crc=0) will not be supported after September 2030.");
+	}
+
+	/* ASCII case insensitivity is undergoing deprecation. */
+	if (xfs_has_asciici(mp)) {
+#ifdef CONFIG_XFS_SUPPORT_ASCII_CI
+		xfs_warn_once(mp,
+	"Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030.");
+#else
+		xfs_warn(mp,
+	"Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel.");
+		error = -EINVAL;
+		goto out_free_sb;
+#endif
+	}
+
+	/*
+	 * Filesystem claims it needs repair, so refuse the mount unless
+	 * norecovery is also specified, in which case the filesystem can
+	 * be mounted with no risk of further damage.
+	 */
+	if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) {
+		xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
+		error = -EFSCORRUPTED;
+		goto out_free_sb;
+	}
+
+	/*
+	 * Don't touch the filesystem if a user tool thinks it owns the primary
+	 * superblock. mkfs doesn't clear the flag from secondary supers, so
+	 * we don't check them at all.
+	 */
+	if (mp->m_sb.sb_inprogress) {
+		xfs_warn(mp, "Offline file system operation in progress!");
+		error = -EFSCORRUPTED;
+		goto out_free_sb;
+	}
+
+	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
+		size_t max_folio_size = mapping_max_folio_size_supported();
+
+		if (!xfs_has_crc(mp)) {
+			xfs_warn(mp,
+"V4 Filesystem with blocksize %d bytes. 
Only pagesize (%ld) or less is supported.", + mp->m_sb.sb_blocksize, PAGE_SIZE); + error = -ENOSYS; + goto out_free_sb; + } + + if (mp->m_sb.sb_blocksize > max_folio_size) { + xfs_warn(mp, +"block size (%u bytes) not supported; Only block size (%zu) or less is supported", + mp->m_sb.sb_blocksize, max_folio_size); + error = -ENOSYS; + goto out_free_sb; + } + + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LBS); + } + + /* Ensure this filesystem fits in the page cache limits */ + if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) || + xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + error = -EFBIG; + goto out_free_sb; + } + + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. + */ + if (!xfs_verify_fileoff(mp, XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE))) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + + error = xfs_rtmount_readsb(mp); if (error) goto out_free_sb; + error = xfs_filestream_mount(mp); + if (error) + goto out_free_rtsb; + /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. */ - sb->s_magic = XFS_SB_MAGIC; + sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; + if (xfs_has_bigtime(mp)) { + sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); + sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); + } else { + sb->s_time_min = XFS_LEGACY_TIME_MIN; + sb->s_time_max = XFS_LEGACY_TIME_MAX; + } + trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; + set_posix_acl_flag(sb); /* version 5 superblocks support inode version counters. */ - if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) - sb->s_flags |= MS_I_VERSION; + if (xfs_has_crc(mp)) + sb->s_flags |= SB_I_VERSION; + + if (xfs_has_dax_always(mp)) { + error = xfs_setup_dax_always(mp); + if (error) + goto out_filestream_unmount; + } - if (mp->m_flags & XFS_MOUNT_DAX) { + if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + "mounting with \"discard\" option, but the device does not support discard"); + mp->m_features &= ~XFS_FEAT_DISCARD; + } - error = bdev_dax_supported(sb, sb->s_blocksize); - if (error) { + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { xfs_alert(mp, - "DAX unsupported by block device. 
Turning off DAX."); - mp->m_flags &= ~XFS_MOUNT_DAX; + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; } - if (xfs_sb_version_hasreflink(&mp->m_sb)) - xfs_alert(mp, - "DAX and reflink have not been tested together!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if (xfs_has_metadir(mp)) { + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); } - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - if (mp->m_sb.sb_rblocks) { + if (xfs_has_reflink(mp)) { + if (xfs_has_realtime(mp) && + !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { xfs_alert(mp, - "EXPERIMENTAL reverse mapping btree not compatible with realtime device!"); + "reflink not compatible with realtime extent size %u!", + mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; } - xfs_alert(mp, - "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); + + if (xfs_has_zoned(mp)) { + xfs_alert(mp, + "reflink not compatible with zoned RT device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } - if (xfs_sb_version_hasreflink(&mp->m_sb)) - xfs_alert(mp, - "EXPERIMENTAL reflink feature enabled. Use at your own risk!"); + /* + * If no quota mount options were provided, maybe we'll try to pick + * up the quota accounting and enforcement flags from the ondisk sb. + */ + if (!(mp->m_qflags & XFS_QFLAGS_MNTOPTS)) + xfs_set_resuming_quotaon(mp); + mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS; error = xfs_mountfs(mp); if (error) @@ -1671,285 +1976,595 @@ xfs_fs_fill_super( out_filestream_unmount: xfs_filestream_unmount(mp); + out_free_rtsb: + xfs_rtmount_freesb(mp); out_free_sb: xfs_freesb(mp); + out_free_scrub_stats: + xchk_mount_stats_free(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); + out_destroy_inodegc: + xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); - out_close_devices: - xfs_close_devices(mp); - out_free_fsname: - xfs_free_fsname(mp); - kfree(mp); - out: + out_shutdown_devices: + xfs_shutdown_devices(mp); return error; out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - goto out_free_sb; + goto out_free_rtsb; } -STATIC void -xfs_fs_put_super( - struct super_block *sb) +static int +xfs_fs_get_tree( + struct fs_context *fc) { - struct xfs_mount *mp = XFS_M(sb); + return get_tree_bdev(fc, xfs_fs_fill_super); +} - xfs_notice(mp, "Unmounting Filesystem"); - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); +static int +xfs_remount_rw( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + int error; - xfs_freesb(mp); - free_percpu(mp->m_stats.xs_stats); - xfs_destroy_percpu_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - xfs_free_fsname(mp); - kfree(mp); + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && + xfs_readonly_buftarg(mp->m_logdev_targp)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only logdev"); + return -EACCES; + } + + if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only rtdev"); + return -EACCES; + } + + if (xfs_has_norecovery(mp)) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + + if (xfs_sb_is_v5(sbp) && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, + 
"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } + + xfs_clear_readonly(mp); + + /* + * If this is the first remount to writeable state we might have some + * superblock changes to update. + */ + if (mp->m_update_sb) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_sb = false; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed value if + * it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + xfs_blockgc_start(mp); + + /* Create the per-AG metadata reservation pool .*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + return error; + + /* Re-enable the background inode inactivation worker. */ + xfs_inodegc_start(mp); + + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + + return 0; } -STATIC struct dentry * -xfs_fs_mount( - struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int +xfs_remount_ro( + struct xfs_mount *mp) { - return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); + struct xfs_icwalk icw = { + .icw_flags = XFS_ICWALK_FLAG_SYNC, + }; + int error; + + /* Flush all the dirty data to disk. */ + error = sync_filesystem(mp->m_super); + if (error) + return error; + + /* + * Cancel background eofb scanning so it cannot race with the final + * log force+buftarg wait and deadlock the remount. + */ + xfs_blockgc_stop(mp); + + /* + * Clear out all remaining COW staging extents and speculative post-EOF + * preallocations so that we don't leave inodes requiring inactivation + * cleanups during reclaim on a read-only mount. We must process every + * cached inode, so this requires a synchronous cache scan. + */ + error = xfs_blockgc_free_space(mp, &icw); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + /* + * Stop the inodegc background worker. xfs_fs_reconfigure already + * flushed all pending inodegc work when it sync'd the filesystem. + * The VFS holds s_umount, so we know that inodes cannot enter + * xfs_fs_destroy_inode during a remount operation. In readonly mode + * we send inodes straight to reclaim, so no inodes will be queued. + */ + xfs_inodegc_stop(mp); + + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + + /* Free the per-AG metadata reservation pool. */ + xfs_fs_unreserve_ag_blocks(mp); + + /* + * Before we sync the metadata, we need to free up the reserve block + * pool so that the used block count in the superblock on disk is + * correct at the end of the remount. Stash the current* reserve pool + * size so that if we get remounted rw, we can return it to the same + * size. + */ + xfs_save_resvblks(mp); + + xfs_log_clean(mp); + xfs_set_readonly(mp); + + return 0; } -static long -xfs_fs_nr_cached_objects( - struct super_block *sb, - struct shrink_control *sc) +/* + * Logically we would return an error here to prevent users from believing + * they might have changed mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from mtab and fstab to the mount + * arguments in some cases so we can't blindly reject options, but have to + * check for each specified option if it actually differs from the currently + * set option and only reject it if that's the case. 
-static long
-xfs_fs_nr_cached_objects(
-	struct super_block	*sb,
-	struct shrink_control	*sc)
+/*
+ * Logically we would return an error here to prevent users from believing
+ * they might have changed mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from mtab and fstab to the mount
+ * arguments in some cases so we can't blindly reject options, but have to
+ * check for each specified option if it actually differs from the currently
+ * set option and only reject it if that's the case.
+ *
+ * Until that is implemented we return success for every remount request, and
+ * silently ignore all options that we can't actually change.
+ */
+static int
+xfs_fs_reconfigure(
+	struct fs_context *fc)
 {
-	return xfs_reclaim_inodes_count(XFS_M(sb));
+	struct xfs_mount	*mp = XFS_M(fc->root->d_sb);
+	struct xfs_mount	*new_mp = fc->s_fs_info;
+	int			flags = fc->sb_flags;
+	int			error;
+
+	new_mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS;
+
+	/* version 5 superblocks always support version counters. */
+	if (xfs_has_crc(mp))
+		fc->sb_flags |= SB_I_VERSION;
+
+	error = xfs_fs_validate_params(new_mp);
+	if (error)
+		return error;
+
+	/* Validate new max_atomic_write option before making other changes */
+	if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
+		error = xfs_set_max_atomic_write_opt(mp,
+				new_mp->m_awu_max_bytes);
+		if (error)
+			return error;
+	}
+
+	/* inode32 -> inode64 */
+	if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
+		mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
+		mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
+	}
+
+	/* inode64 -> inode32 */
+	if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) {
+		mp->m_features |= XFS_FEAT_SMALL_INUMS;
+		mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
+	}
+
+	/*
+	 * Now that mp has been modified according to the remount options, we
+	 * do a final option validation with xfs_finish_flags() just like it is
+	 * done during mount. We cannot use xfs_finish_flags() on new_mp as it
+	 * contains only the user given options.
+	 */
+	error = xfs_finish_flags(mp);
+	if (error)
+		return error;
+
+	/* ro -> rw */
+	if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
+		error = xfs_remount_rw(mp);
+		if (error)
+			return error;
+	}
+
+	/* rw -> ro */
+	if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) {
+		error = xfs_remount_ro(mp);
+		if (error)
+			return error;
+	}
+
+	return 0;
 }
 
-static long
-xfs_fs_free_cached_objects(
-	struct super_block	*sb,
-	struct shrink_control	*sc)
+static void
+xfs_fs_free(
+	struct fs_context	*fc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+	struct xfs_mount	*mp = fc->s_fs_info;
+
+	/*
+	 * mp is stored in the fs_context when it is initialized.
+	 * mp is transferred to the superblock on a successful mount,
+	 * but if an error occurs before the transfer we have to free
+	 * it here.
+	 */
+	if (mp)
+		xfs_mount_free(mp);
 }
 
-static const struct super_operations xfs_super_operations = {
-	.alloc_inode		= xfs_fs_alloc_inode,
-	.destroy_inode		= xfs_fs_destroy_inode,
-	.drop_inode		= xfs_fs_drop_inode,
-	.put_super		= xfs_fs_put_super,
-	.sync_fs		= xfs_fs_sync_fs,
-	.freeze_fs		= xfs_fs_freeze,
-	.unfreeze_fs		= xfs_fs_unfreeze,
-	.statfs			= xfs_fs_statfs,
-	.remount_fs		= xfs_fs_remount,
-	.show_options		= xfs_fs_show_options,
-	.nr_cached_objects	= xfs_fs_nr_cached_objects,
-	.free_cached_objects	= xfs_fs_free_cached_objects,
+static const struct fs_context_operations xfs_context_ops = {
+	.parse_param = xfs_fs_parse_param,
+	.get_tree    = xfs_fs_get_tree,
+	.reconfigure = xfs_fs_reconfigure,
+	.free        = xfs_fs_free,
 };
 
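[Aside, not part of the patch: every filesystem converted to the new mount API supplies the same small set of hooks that xfs_context_ops wires up above. A minimal sketch of the shape of that contract for a hypothetical filesystem follows; all "toyfs" identifiers are invented, only the parameter-parsing path is shown, and .get_tree/.free are elided.]

	#include <linux/fs_context.h>
	#include <linux/fs_parser.h>

	enum { Opt_toy_size };

	static const struct fs_parameter_spec toyfs_parameters[] = {
		fsparam_u32("size", Opt_toy_size),	/* e.g. mount -o size=42 */
		{}
	};

	static int toyfs_parse_param(struct fs_context *fc,
				     struct fs_parameter *param)
	{
		struct fs_parse_result result;
		int opt = fs_parse(fc, toyfs_parameters, param, &result);

		if (opt < 0)
			return opt;
		/* opt == Opt_toy_size; stash result.uint_32 in fc->fs_private. */
		return 0;
	}

	static const struct fs_context_operations toyfs_context_ops = {
		.parse_param	= toyfs_parse_param,
		/* .get_tree and .free would be wired like xfs_context_ops. */
	};

	/* Hooked up via file_system_type.init_fs_context, as below for XFS. */
	static int toyfs_init_fs_context(struct fs_context *fc)
	{
		fc->ops = &toyfs_context_ops;
		return 0;
	}
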
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
+static int
+xfs_init_fs_context(
+	struct fs_context	*fc)
+{
+	struct xfs_mount	*mp;
+	int			i;
+
+	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
+	if (!mp)
+		return -ENOMEM;
+
+	spin_lock_init(&mp->m_sb_lock);
+	for (i = 0; i < XG_TYPE_MAX; i++)
+		xa_init(&mp->m_groups[i].xa);
+	mutex_init(&mp->m_growlock);
+	mutex_init(&mp->m_metafile_resv_lock);
+	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+	mp->m_kobj.kobject.kset = xfs_kset;
+	/*
+	 * We don't create the finobt per-ag space reservation until after log
+	 * recovery, so we must set this to true so that an ifree transaction
+	 * started during log recovery will not depend on space reservations
+	 * for finobt expansion.
+	 */
+	mp->m_finobt_nores = true;
+
+	/*
+	 * These can be overridden by the mount option parsing.
+	 */
+	mp->m_logbufs = -1;
+	mp->m_logbsize = -1;
+	mp->m_allocsize_log = 16; /* 64k */
+
+	xfs_hooks_init(&mp->m_dir_update_hooks);
+
+	fc->s_fs_info = mp;
+	fc->ops = &xfs_context_ops;
+
+	return 0;
+}
+
+static void
+xfs_kill_sb(
+	struct super_block	*sb)
+{
+	kill_block_super(sb);
+	xfs_mount_free(XFS_M(sb));
+}
+
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
-	.mount			= xfs_fs_mount,
-	.kill_sb		= kill_block_super,
-	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= xfs_init_fs_context,
+	.parameters		= xfs_fs_parameters,
+	.kill_sb		= xfs_kill_sb,
+	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+				  FS_LBS,
 };
 MODULE_ALIAS_FS("xfs");
 
 STATIC int __init
-xfs_init_zones(void)
+xfs_init_caches(void)
 {
-	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
-			offsetof(struct xfs_ioend, io_inline_bio),
-			BIOSET_NEED_BVECS);
-	if (!xfs_ioend_bioset)
+	int		error;
+
+	xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+					  SLAB_HWCACHE_ALIGN |
+					  SLAB_RECLAIM_ACCOUNT,
+					  NULL);
+	if (!xfs_buf_cache)
 		goto out;
 
-	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
-						"xfs_log_ticket");
-	if (!xfs_log_ticket_zone)
-		goto out_free_ioend_bioset;
-
-	xfs_bmap_free_item_zone = kmem_zone_init(
-			sizeof(struct xfs_extent_free_item),
-			"xfs_bmap_free_item");
-	if (!xfs_bmap_free_item_zone)
-		goto out_destroy_log_ticket_zone;
-
-	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-			"xfs_btree_cur");
-	if (!xfs_btree_cur_zone)
-		goto out_destroy_bmap_free_item_zone;
-
-	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
-			"xfs_da_state");
-	if (!xfs_da_state_zone)
-		goto out_destroy_btree_cur_zone;
-
-	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
-	if (!xfs_ifork_zone)
-		goto out_destroy_da_state_zone;
-
-	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
-	if (!xfs_trans_zone)
-		goto out_destroy_ifork_zone;
-
-	xfs_log_item_desc_zone =
-		kmem_zone_init(sizeof(struct xfs_log_item_desc),
-			       "xfs_log_item_desc");
-	if (!xfs_log_item_desc_zone)
-		goto out_destroy_trans_zone;
+	xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
+						sizeof(struct xlog_ticket),
+						0, 0, NULL);
+	if (!xfs_log_ticket_cache)
+		goto out_destroy_buf_cache;
+
+	error = xfs_btree_init_cur_caches();
+	if (error)
+		goto out_destroy_log_ticket_cache;
+
+	error = rcbagbt_init_cur_cache();
+	if (error)
+		goto out_destroy_btree_cur_cache;
+
+	error = xfs_defer_init_item_caches();
+	if (error)
+		goto out_destroy_rcbagbt_cur_cache;
+
+	xfs_da_state_cache = kmem_cache_create("xfs_da_state",
+					       sizeof(struct xfs_da_state),
+					       0, 0, NULL);
+	if (!xfs_da_state_cache)
+
goto out_destroy_defer_item_cache; + + xfs_ifork_cache = kmem_cache_create("xfs_ifork", + sizeof(struct xfs_ifork), + 0, 0, NULL); + if (!xfs_ifork_cache) + goto out_destroy_da_state_cache; + + xfs_trans_cache = kmem_cache_create("xfs_trans", + sizeof(struct xfs_trans), + 0, 0, NULL); + if (!xfs_trans_cache) + goto out_destroy_ifork_cache; + /* - * The size of the zone allocated buf log item is the maximum + * The size of the cache-allocated buf log item is the maximum * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ - xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), - "xfs_buf_item"); - if (!xfs_buf_item_zone) - goto out_destroy_log_item_desc_zone; - - xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + - ((XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), "xfs_efd_item"); - if (!xfs_efd_zone) - goto out_destroy_buf_item_zone; - - xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + - ((XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), "xfs_efi_item"); - if (!xfs_efi_zone) - goto out_destroy_efd_zone; - - xfs_inode_zone = - kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | - KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); - if (!xfs_inode_zone) - goto out_destroy_efi_zone; - - xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); - if (!xfs_ili_zone) - goto out_destroy_inode_zone; - xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), - "xfs_icr"); - if (!xfs_icreate_zone) - goto out_destroy_ili_zone; - - xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item), - "xfs_rud_item"); - if (!xfs_rud_zone) - goto out_destroy_icreate_zone; - - xfs_rui_zone = kmem_zone_init( + xfs_buf_item_cache = kmem_cache_create("xfs_buf_item", + sizeof(struct xfs_buf_log_item), + 0, 0, NULL); + if (!xfs_buf_item_cache) + goto out_destroy_trans_cache; + + xfs_efd_cache = kmem_cache_create("xfs_efd_item", + xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS), + 0, 0, NULL); + if (!xfs_efd_cache) + goto out_destroy_buf_item_cache; + + xfs_efi_cache = kmem_cache_create("xfs_efi_item", + xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS), + 0, 0, NULL); + if (!xfs_efi_cache) + goto out_destroy_efd_cache; + + xfs_inode_cache = kmem_cache_create("xfs_inode", + sizeof(struct xfs_inode), 0, + (SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT), + xfs_fs_inode_init_once); + if (!xfs_inode_cache) + goto out_destroy_efi_cache; + + xfs_ili_cache = kmem_cache_create("xfs_ili", + sizeof(struct xfs_inode_log_item), 0, + SLAB_RECLAIM_ACCOUNT, + NULL); + if (!xfs_ili_cache) + goto out_destroy_inode_cache; + + xfs_icreate_cache = kmem_cache_create("xfs_icr", + sizeof(struct xfs_icreate_item), + 0, 0, NULL); + if (!xfs_icreate_cache) + goto out_destroy_ili_cache; + + xfs_rud_cache = kmem_cache_create("xfs_rud_item", + sizeof(struct xfs_rud_log_item), + 0, 0, NULL); + if (!xfs_rud_cache) + goto out_destroy_icreate_cache; + + xfs_rui_cache = kmem_cache_create("xfs_rui_item", xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), - "xfs_rui_item"); - if (!xfs_rui_zone) - goto out_destroy_rud_zone; + 0, 0, NULL); + if (!xfs_rui_cache) + goto out_destroy_rud_cache; - xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), - "xfs_cud_item"); - if (!xfs_cud_zone) - goto out_destroy_rui_zone; + xfs_cud_cache = kmem_cache_create("xfs_cud_item", + sizeof(struct xfs_cud_log_item), + 0, 0, NULL); + if (!xfs_cud_cache) 
+ goto out_destroy_rui_cache; - xfs_cui_zone = kmem_zone_init( + xfs_cui_cache = kmem_cache_create("xfs_cui_item", xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), - "xfs_cui_item"); - if (!xfs_cui_zone) - goto out_destroy_cud_zone; + 0, 0, NULL); + if (!xfs_cui_cache) + goto out_destroy_cud_cache; - xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), - "xfs_bud_item"); - if (!xfs_bud_zone) - goto out_destroy_cui_zone; + xfs_bud_cache = kmem_cache_create("xfs_bud_item", + sizeof(struct xfs_bud_log_item), + 0, 0, NULL); + if (!xfs_bud_cache) + goto out_destroy_cui_cache; - xfs_bui_zone = kmem_zone_init( + xfs_bui_cache = kmem_cache_create("xfs_bui_item", xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), - "xfs_bui_item"); - if (!xfs_bui_zone) - goto out_destroy_bud_zone; + 0, 0, NULL); + if (!xfs_bui_cache) + goto out_destroy_bud_cache; + + xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + if (!xfs_attrd_cache) + goto out_destroy_bui_cache; + + xfs_attri_cache = kmem_cache_create("xfs_attri_item", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + if (!xfs_attri_cache) + goto out_destroy_attrd_cache; + + xfs_iunlink_cache = kmem_cache_create("xfs_iul_item", + sizeof(struct xfs_iunlink_item), + 0, 0, NULL); + if (!xfs_iunlink_cache) + goto out_destroy_attri_cache; + + xfs_xmd_cache = kmem_cache_create("xfs_xmd_item", + sizeof(struct xfs_xmd_log_item), + 0, 0, NULL); + if (!xfs_xmd_cache) + goto out_destroy_iul_cache; + + xfs_xmi_cache = kmem_cache_create("xfs_xmi_item", + sizeof(struct xfs_xmi_log_item), + 0, 0, NULL); + if (!xfs_xmi_cache) + goto out_destroy_xmd_cache; + + xfs_parent_args_cache = kmem_cache_create("xfs_parent_args", + sizeof(struct xfs_parent_args), + 0, 0, NULL); + if (!xfs_parent_args_cache) + goto out_destroy_xmi_cache; return 0; - out_destroy_bud_zone: - kmem_zone_destroy(xfs_bud_zone); - out_destroy_cui_zone: - kmem_zone_destroy(xfs_cui_zone); - out_destroy_cud_zone: - kmem_zone_destroy(xfs_cud_zone); - out_destroy_rui_zone: - kmem_zone_destroy(xfs_rui_zone); - out_destroy_rud_zone: - kmem_zone_destroy(xfs_rud_zone); - out_destroy_icreate_zone: - kmem_zone_destroy(xfs_icreate_zone); - out_destroy_ili_zone: - kmem_zone_destroy(xfs_ili_zone); - out_destroy_inode_zone: - kmem_zone_destroy(xfs_inode_zone); - out_destroy_efi_zone: - kmem_zone_destroy(xfs_efi_zone); - out_destroy_efd_zone: - kmem_zone_destroy(xfs_efd_zone); - out_destroy_buf_item_zone: - kmem_zone_destroy(xfs_buf_item_zone); - out_destroy_log_item_desc_zone: - kmem_zone_destroy(xfs_log_item_desc_zone); - out_destroy_trans_zone: - kmem_zone_destroy(xfs_trans_zone); - out_destroy_ifork_zone: - kmem_zone_destroy(xfs_ifork_zone); - out_destroy_da_state_zone: - kmem_zone_destroy(xfs_da_state_zone); - out_destroy_btree_cur_zone: - kmem_zone_destroy(xfs_btree_cur_zone); - out_destroy_bmap_free_item_zone: - kmem_zone_destroy(xfs_bmap_free_item_zone); - out_destroy_log_ticket_zone: - kmem_zone_destroy(xfs_log_ticket_zone); - out_free_ioend_bioset: - bioset_free(xfs_ioend_bioset); + out_destroy_xmi_cache: + kmem_cache_destroy(xfs_xmi_cache); + out_destroy_xmd_cache: + kmem_cache_destroy(xfs_xmd_cache); + out_destroy_iul_cache: + kmem_cache_destroy(xfs_iunlink_cache); + out_destroy_attri_cache: + kmem_cache_destroy(xfs_attri_cache); + out_destroy_attrd_cache: + kmem_cache_destroy(xfs_attrd_cache); + out_destroy_bui_cache: + kmem_cache_destroy(xfs_bui_cache); + out_destroy_bud_cache: + kmem_cache_destroy(xfs_bud_cache); + out_destroy_cui_cache: + 
kmem_cache_destroy(xfs_cui_cache); + out_destroy_cud_cache: + kmem_cache_destroy(xfs_cud_cache); + out_destroy_rui_cache: + kmem_cache_destroy(xfs_rui_cache); + out_destroy_rud_cache: + kmem_cache_destroy(xfs_rud_cache); + out_destroy_icreate_cache: + kmem_cache_destroy(xfs_icreate_cache); + out_destroy_ili_cache: + kmem_cache_destroy(xfs_ili_cache); + out_destroy_inode_cache: + kmem_cache_destroy(xfs_inode_cache); + out_destroy_efi_cache: + kmem_cache_destroy(xfs_efi_cache); + out_destroy_efd_cache: + kmem_cache_destroy(xfs_efd_cache); + out_destroy_buf_item_cache: + kmem_cache_destroy(xfs_buf_item_cache); + out_destroy_trans_cache: + kmem_cache_destroy(xfs_trans_cache); + out_destroy_ifork_cache: + kmem_cache_destroy(xfs_ifork_cache); + out_destroy_da_state_cache: + kmem_cache_destroy(xfs_da_state_cache); + out_destroy_defer_item_cache: + xfs_defer_destroy_item_caches(); + out_destroy_rcbagbt_cur_cache: + rcbagbt_destroy_cur_cache(); + out_destroy_btree_cur_cache: + xfs_btree_destroy_cur_caches(); + out_destroy_log_ticket_cache: + kmem_cache_destroy(xfs_log_ticket_cache); + out_destroy_buf_cache: + kmem_cache_destroy(xfs_buf_cache); out: return -ENOMEM; } STATIC void -xfs_destroy_zones(void) +xfs_destroy_caches(void) { /* * Make sure all delayed rcu free are flushed before we * destroy caches. */ rcu_barrier(); - kmem_zone_destroy(xfs_bui_zone); - kmem_zone_destroy(xfs_bud_zone); - kmem_zone_destroy(xfs_cui_zone); - kmem_zone_destroy(xfs_cud_zone); - kmem_zone_destroy(xfs_rui_zone); - kmem_zone_destroy(xfs_rud_zone); - kmem_zone_destroy(xfs_icreate_zone); - kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_inode_zone); - kmem_zone_destroy(xfs_efi_zone); - kmem_zone_destroy(xfs_efd_zone); - kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_log_item_desc_zone); - kmem_zone_destroy(xfs_trans_zone); - kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_da_state_zone); - kmem_zone_destroy(xfs_btree_cur_zone); - kmem_zone_destroy(xfs_bmap_free_item_zone); - kmem_zone_destroy(xfs_log_ticket_zone); - bioset_free(xfs_ioend_bioset); + kmem_cache_destroy(xfs_parent_args_cache); + kmem_cache_destroy(xfs_xmd_cache); + kmem_cache_destroy(xfs_xmi_cache); + kmem_cache_destroy(xfs_iunlink_cache); + kmem_cache_destroy(xfs_attri_cache); + kmem_cache_destroy(xfs_attrd_cache); + kmem_cache_destroy(xfs_bui_cache); + kmem_cache_destroy(xfs_bud_cache); + kmem_cache_destroy(xfs_cui_cache); + kmem_cache_destroy(xfs_cud_cache); + kmem_cache_destroy(xfs_rui_cache); + kmem_cache_destroy(xfs_rud_cache); + kmem_cache_destroy(xfs_icreate_cache); + kmem_cache_destroy(xfs_ili_cache); + kmem_cache_destroy(xfs_inode_cache); + kmem_cache_destroy(xfs_efi_cache); + kmem_cache_destroy(xfs_efd_cache); + kmem_cache_destroy(xfs_buf_item_cache); + kmem_cache_destroy(xfs_trans_cache); + kmem_cache_destroy(xfs_ifork_cache); + kmem_cache_destroy(xfs_da_state_cache); + xfs_defer_destroy_item_caches(); + rcbagbt_destroy_cur_cache(); + xfs_btree_destroy_cur_caches(); + kmem_cache_destroy(xfs_log_ticket_cache); + kmem_cache_destroy(xfs_buf_cache); } STATIC int __init @@ -1961,12 +2576,13 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. 
 */
-	xfs_alloc_wq = alloc_workqueue("xfsalloc",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+	xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+			0);
 	if (!xfs_alloc_wq)
 		return -ENOMEM;
 
-	xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+	xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+			0);
 	if (!xfs_discard_wq)
 		goto out_free_alloc_wq;
 
@@ -1990,44 +2606,41 @@ init_xfs_fs(void)
 
 	xfs_check_ondisk_structs();
 
+	error = xfs_dahash_test();
+	if (error)
+		return error;
+
 	printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n");
 
-	xfs_extent_free_init_defer_op();
-	xfs_rmap_update_init_defer_op();
-	xfs_refcount_update_init_defer_op();
-	xfs_bmap_update_init_defer_op();
-
 	xfs_dir_startup();
 
-	error = xfs_init_zones();
+	error = xfs_init_caches();
 	if (error)
 		goto out;
 
 	error = xfs_init_workqueues();
 	if (error)
-		goto out_destroy_zones;
+		goto out_destroy_caches;
 
 	error = xfs_mru_cache_init();
 	if (error)
 		goto out_destroy_wq;
 
-	error = xfs_buf_init();
-	if (error)
-		goto out_mru_cache_uninit;
-
 	error = xfs_init_procfs();
 	if (error)
-		goto out_buf_terminate;
+		goto out_mru_cache_uninit;
 
 	error = xfs_sysctl_register();
 	if (error)
 		goto out_cleanup_procfs;
 
+	xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL);
+
 	xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
 	if (!xfs_kset) {
 		error = -ENOMEM;
-		goto out_sysctl_unregister;
+		goto out_debugfs_unregister;
 	}
 
 	xfsstats.xs_kobj.kobject.kset = xfs_kset;
@@ -2043,11 +2656,15 @@ init_xfs_fs(void)
 	if (error)
 		goto out_free_stats;
 
+	error = xchk_global_stats_setup(xfs_debugfs);
+	if (error)
+		goto out_remove_stats_kobj;
+
 #ifdef DEBUG
 	xfs_dbg_kobj.kobject.kset = xfs_kset;
 	error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
 	if (error)
-		goto out_remove_stats_kobj;
+		goto out_remove_scrub_stats;
 #endif
 
 	error = xfs_qm_init();
@@ -2064,25 +2681,26 @@ init_xfs_fs(void)
  out_remove_dbg_kobj:
 #ifdef DEBUG
 	xfs_sysfs_del(&xfs_dbg_kobj);
- out_remove_stats_kobj:
+ out_remove_scrub_stats:
 #endif
+	xchk_global_stats_teardown();
+ out_remove_stats_kobj:
 	xfs_sysfs_del(&xfsstats.xs_kobj);
  out_free_stats:
 	free_percpu(xfsstats.xs_stats);
  out_kset_unregister:
 	kset_unregister(xfs_kset);
- out_sysctl_unregister:
+ out_debugfs_unregister:
+	debugfs_remove(xfs_debugfs);
 	xfs_sysctl_unregister();
  out_cleanup_procfs:
 	xfs_cleanup_procfs();
- out_buf_terminate:
-	xfs_buf_terminate();
 out_mru_cache_uninit:
 	xfs_mru_cache_uninit();
 out_destroy_wq:
 	xfs_destroy_workqueues();
-out_destroy_zones:
-	xfs_destroy_zones();
+out_destroy_caches:
+	xfs_destroy_caches();
 out:
 	return error;
 }
@@ -2095,15 +2713,16 @@ exit_xfs_fs(void)
 #ifdef DEBUG
 	xfs_sysfs_del(&xfs_dbg_kobj);
 #endif
+	xchk_global_stats_teardown();
 	xfs_sysfs_del(&xfsstats.xs_kobj);
 	free_percpu(xfsstats.xs_stats);
 	kset_unregister(xfs_kset);
+	debugfs_remove(xfs_debugfs);
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
-	xfs_buf_terminate();
 	xfs_mru_cache_uninit();
 	xfs_destroy_workqueues();
-	xfs_destroy_zones();
+	xfs_destroy_caches();
 	xfs_uuid_table_free();
 }
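
[Aside, not part of the patch: the suffix_kstrtoint()/suffix_kstrtoull() helpers added earlier accept an optional k/m/g unit suffix, so e.g. logbsize=64k is parsed as 65536. A stand-alone user-space analog of that behaviour follows; parse_size() is an invented name and the kernel helpers differ in error handling.]

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Shift the parsed value by 10/20/30 bits for a k/m/g suffix,
	 * mirroring shift_left_factor in the kernel helpers. */
	static long long parse_size(const char *s)
	{
		char *dup, *end;
		long long val;
		int shift = 0;
		size_t last;

		if (!s || !*s || !(dup = strdup(s)))
			return -1;

		last = strlen(dup) - 1;
		if (dup[last] == 'k' || dup[last] == 'K')
			shift = 10;
		else if (dup[last] == 'm' || dup[last] == 'M')
			shift = 20;
		else if (dup[last] == 'g' || dup[last] == 'G')
			shift = 30;
		if (shift)
			dup[last] = '\0';

		val = strtoll(dup, &end, 10) << shift;
		free(dup);
		return val;
	}

	int main(void)
	{
		printf("%lld\n", parse_size("64k"));	/* 65536 */
		printf("%lld\n", parse_size("128m"));	/* 134217728 */
		return 0;
	}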
