diff options
Diffstat (limited to 'fs/btrfs/super.c')
| -rw-r--r-- | fs/btrfs/super.c | 3443 |
1 files changed, 2179 insertions, 1264 deletions
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8eb6191d86da..1999533b52be 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1,24 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/blkdev.h> #include <linux/module.h> -#include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -28,8 +14,6 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> @@ -39,713 +23,912 @@ #include <linux/miscdevice.h> #include <linux/magic.h> #include <linux/slab.h> -#include <linux/cleancache.h> #include <linux/ratelimit.h> +#include <linux/crc32c.h> #include <linux/btrfs.h> -#include "compat.h" +#include <linux/security.h> +#include <linux/fs_parser.h> +#include "messages.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" +#include "direct-io.h" +#include "props.h" #include "xattr.h" -#include "volumes.h" +#include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" - +#include "backref.h" +#include "space-info.h" +#include "sysfs.h" +#include "zoned.h" +#include "tests/btrfs-tests.h" +#include "block-group.h" +#include "discard.h" +#include "qgroup.h" +#include "raid56.h" +#include "fs.h" +#include "accessors.h" +#include "defrag.h" +#include "dir-item.h" +#include "ioctl.h" +#include "scrub.h" +#include "verity.h" +#include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(int errno) +static void btrfs_put_super(struct super_block *sb) { - char *errstr = "unknown"; - - switch (errno) { - case -EIO: - errstr = "IO failure"; - break; - case -ENOMEM: - errstr = "Out of memory"; - break; - case -EROFS: - errstr = "Readonly filesystem"; - break; - case -EEXIST: - errstr = "Object already exists"; - break; - case -ENOSPC: - errstr = "No space left"; - break; - case -ENOENT: - errstr = "No such entry"; - break; - } + struct btrfs_fs_info *fs_info = btrfs_sb(sb); - return errstr; + btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid); + close_ctree(fs_info); } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); -} +/* Store the mount options related information. */ +struct btrfs_fs_context { + char *subvol_name; + u64 subvol_objectid; + u64 max_inline; + u32 commit_interval; + u32 metadata_ratio; + u32 thread_pool_size; + unsigned long long mount_opt; + unsigned long compress_type:4; + int compress_level; + refcount_t refs; +}; -/* btrfs handle error by forcing the filesystem readonly */ -static void btrfs_handle_error(struct btrfs_fs_info *fs_info) -{ - struct super_block *sb = fs_info->sb; +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old); - if (sb->s_flags & MS_RDONLY) - return; +enum { + Opt_acl, + Opt_clear_cache, + Opt_commit_interval, + Opt_compress, + Opt_compress_force, + Opt_compress_force_type, + Opt_compress_type, + Opt_degraded, + Opt_device, + Opt_fatal_errors, + Opt_flushoncommit, + Opt_max_inline, + Opt_barrier, + Opt_datacow, + Opt_datasum, + Opt_defrag, + Opt_discard, + Opt_discard_mode, + Opt_ratio, + Opt_rescan_uuid_tree, + Opt_skip_balance, + Opt_space_cache, + Opt_space_cache_version, + Opt_ssd, + Opt_ssd_spread, + Opt_subvol, + Opt_subvol_empty, + Opt_subvolid, + Opt_thread_pool, + Opt_treelog, + Opt_user_subvol_rm_allowed, + Opt_norecovery, + + /* Rescue options */ + Opt_rescue, + Opt_usebackuproot, + + /* Debugging options */ + Opt_enospc_debug, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, + Opt_ref_verify, + Opt_ref_tracker, +#endif + Opt_err, +}; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= MS_RDONLY; - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not - * canceled here although there is no way to update - * the progress. It would add the risk of a deadlock, - * therefore the canceling is ommited. The only penalty - * is that some I/O remains active until the procedure - * completes. The next time when the filesystem is - * mounted writeable again, the device replace - * operation continues. - */ - } -} +enum { + Opt_fatal_errors_panic, + Opt_fatal_errors_bug, +}; -#ifdef CONFIG_PRINTK -/* - * __btrfs_std_error decodes expected errors from the caller and - * invokes the approciate error response. - */ -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - const char *errstr; +static const struct constant_table btrfs_parameter_fatal_errors[] = { + { "panic", Opt_fatal_errors_panic }, + { "bug", Opt_fatal_errors_bug }, + {} +}; - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; +enum { + Opt_discard_sync, + Opt_discard_async, +}; - errstr = btrfs_decode_error(errno); - if (fmt) { - struct va_format vaf; - va_list args; +static const struct constant_table btrfs_parameter_discard[] = { + { "sync", Opt_discard_sync }, + { "async", Opt_discard_async }, + {} +}; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; +enum { + Opt_space_cache_v1, + Opt_space_cache_v2, +}; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, function, line, errno, errstr, &vaf); - va_end(args); - } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", - sb->s_id, function, line, errno, errstr); - } +static const struct constant_table btrfs_parameter_space_cache[] = { + { "v1", Opt_space_cache_v1 }, + { "v2", Opt_space_cache_v2 }, + {} +}; - /* Don't go through full error handling during mount */ - save_error_info(fs_info); - if (sb->s_flags & MS_BORN) - btrfs_handle_error(fs_info); -} +enum { + Opt_rescue_usebackuproot, + Opt_rescue_nologreplay, + Opt_rescue_ignorebadroots, + Opt_rescue_ignoredatacsums, + Opt_rescue_ignoremetacsums, + Opt_rescue_ignoresuperflags, + Opt_rescue_parameter_all, +}; -static const char * const logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", +static const struct constant_table btrfs_parameter_rescue[] = { + { "usebackuproot", Opt_rescue_usebackuproot }, + { "nologreplay", Opt_rescue_nologreplay }, + { "ignorebadroots", Opt_rescue_ignorebadroots }, + { "ibadroots", Opt_rescue_ignorebadroots }, + { "ignoredatacsums", Opt_rescue_ignoredatacsums }, + { "ignoremetacsums", Opt_rescue_ignoremetacsums}, + { "ignoresuperflags", Opt_rescue_ignoresuperflags}, + { "idatacsums", Opt_rescue_ignoredatacsums }, + { "imetacsums", Opt_rescue_ignoremetacsums}, + { "isuperflags", Opt_rescue_ignoresuperflags}, + { "all", Opt_rescue_parameter_all }, + {} }; -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - char lvl[4]; - struct va_format vaf; - va_list args; - const char *type = logtypes[4]; - int kern_level; - - va_start(args, fmt); - - kern_level = printk_get_level(fmt); - if (kern_level) { - size_t size = printk_skip_level(fmt) - fmt; - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - fmt += size; - type = logtypes[kern_level - '0']; - } else - *lvl = '\0'; - - vaf.fmt = fmt; - vaf.va = &args; - - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); - - va_end(args); -} +#ifdef CONFIG_BTRFS_DEBUG +enum { + Opt_fragment_parameter_data, + Opt_fragment_parameter_metadata, + Opt_fragment_parameter_all, +}; -#else +static const struct constant_table btrfs_parameter_fragment[] = { + { "data", Opt_fragment_parameter_data }, + { "metadata", Opt_fragment_parameter_metadata }, + { "all", Opt_fragment_parameter_all }, + {} +}; +#endif -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; +static const struct fs_parameter_spec btrfs_fs_parameters[] = { + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("autodefrag", Opt_defrag), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("clear_cache", Opt_clear_cache), + fsparam_u32("commit", Opt_commit_interval), + fsparam_flag("compress", Opt_compress), + fsparam_string("compress", Opt_compress_type), + fsparam_flag("compress-force", Opt_compress_force), + fsparam_string("compress-force", Opt_compress_force_type), + fsparam_flag_no("datacow", Opt_datacow), + fsparam_flag_no("datasum", Opt_datasum), + fsparam_flag("degraded", Opt_degraded), + fsparam_string("device", Opt_device), + fsparam_flag_no("discard", Opt_discard), + fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), + fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), + fsparam_flag_no("flushoncommit", Opt_flushoncommit), + fsparam_string("max_inline", Opt_max_inline), + fsparam_u32("metadata_ratio", Opt_ratio), + fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), + fsparam_flag("skip_balance", Opt_skip_balance), + fsparam_flag_no("space_cache", Opt_space_cache), + fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache), + fsparam_flag_no("ssd", Opt_ssd), + fsparam_flag_no("ssd_spread", Opt_ssd_spread), + fsparam_string("subvol", Opt_subvol), + fsparam_flag("subvol=", Opt_subvol_empty), + fsparam_u64("subvolid", Opt_subvolid), + fsparam_u32("thread_pool", Opt_thread_pool), + fsparam_flag_no("treelog", Opt_treelog), + fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed), + + /* Rescue options. */ + fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), + /* Deprecated, with alias rescue=usebackuproot */ + __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), + /* For compatibility only, alias for "rescue=nologreplay". */ + fsparam_flag("norecovery", Opt_norecovery), + + /* Debugging options. */ + fsparam_flag_no("enospc_debug", Opt_enospc_debug), +#ifdef CONFIG_BTRFS_DEBUG + fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), + fsparam_flag("ref_tracker", Opt_ref_tracker), + fsparam_flag("ref_verify", Opt_ref_verify), +#endif + {} +}; - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; +static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level) +{ + const int len = strlen(type); - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } + return (strncmp(string, type, len) == 0) && + ((may_have_level && string[len] == ':') || string[len] == '\0'); } -#endif -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, - unsigned int line, int errno) +static int btrfs_parse_compress(struct btrfs_fs_context *ctx, + const struct fs_parameter *param, int opt) { + const char *string = param->string; + int ret; + /* - * Report first abort since mount + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears "force-compress" + * without the need to pass "compress-force=[no|none]" before + * specifying "compress". */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", - errno); - } - trans->aborted = errno; - /* Nothing used. The other threads that have joined this - * transaction may be able to continue. */ - if (!trans->blocks_used) { - const char *errstr; - - errstr = btrfs_decode_error(errno); - btrfs_warn(root->fs_info, - "%s:%d: Aborting unused transaction(%s).", - function, line, errstr); - return; + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zlib", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "lzo", true)) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, + &ctx->compress_level); + if (ret < 0) + goto error; + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zstd", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "no", false) || + btrfs_match_compress_type(string, "none", false)) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + ret = -EINVAL; + goto error; } - ACCESS_ONCE(trans->transaction->aborted) = errno; - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&root->fs_info->transaction_wait); - wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_std_error(root->fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - char *s_id = "<unknown>"; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(errno); - if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - va_end(args); - /* Caller calls BUG() */ -} + return 0; +error: + btrfs_err(NULL, "failed to parse compression option '%s'", string); + return ret; -static void btrfs_put_super(struct super_block *sb) -{ - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? ->put_super() can be triggered by any number - * of async events, up to and including delivery of SIGKILL to the - * last process that kept it busy. Or segfault in the aforementioned - * process... Whom would you report that to? - */ } -enum { - Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, - Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, - Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_type, Opt_compress_force, Opt_compress_force_type, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_fatal_errors, - Opt_err, -}; - -static match_table_t tokens = { - {Opt_degraded, "degraded"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%d"}, - {Opt_device, "device=%s"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_alloc_start, "alloc_start=%s"}, - {Opt_thread_pool, "thread_pool=%d"}, - {Opt_compress, "compress"}, - {Opt_compress_type, "compress=%s"}, - {Opt_compress_force, "compress-force"}, - {Opt_compress_force_type, "compress-force=%s"}, - {Opt_ssd, "ssd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd, "nossd"}, - {Opt_noacl, "noacl"}, - {Opt_notreelog, "notreelog"}, - {Opt_flushoncommit, "flushoncommit"}, - {Opt_ratio, "metadata_ratio=%d"}, - {Opt_discard, "discard"}, - {Opt_space_cache, "space_cache"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, - {Opt_enospc_debug, "enospc_debug"}, - {Opt_subvolrootid, "subvolrootid=%d"}, - {Opt_defrag, "autodefrag"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_recovery, "recovery"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_err, NULL}, -}; - -/* - * Regular mount options parser. Everything that is needed only when - * reading in a new superblock is parsed here. - * XXX JDM: This needs to be cleaned up for remount. - */ -int btrfs_parse_options(struct btrfs_root *root, char *options) +static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct btrfs_fs_info *info = root->fs_info; - substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig = NULL; - u64 cache_gen; - int intarg; - int ret = 0; - char *compress_type; - bool compress_force = false; - - cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (cache_gen) - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - if (!options) - goto out; + opt = fs_parse(fc, btrfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - /* - * strsep changes the string, duplicate it because parse_options - * gets called twice - */ - options = kstrdup(options, GFP_NOFS); - if (!options) - return -ENOMEM; - - orig = options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; + switch (opt) { + case Opt_degraded: + btrfs_set_opt(ctx->mount_opt, DEGRADED); + break; + case Opt_subvol_empty: + /* + * This exists because we used to allow it on accident, so we're + * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow + * empty subvol= again"). + */ + break; + case Opt_subvol: + kfree(ctx->subvol_name); + ctx->subvol_name = kstrdup(param->string, GFP_KERNEL); + if (!ctx->subvol_name) + return -ENOMEM; + break; + case Opt_subvolid: + ctx->subvol_objectid = result.uint_64; - token = match_token(p, tokens, args); - switch (token) { - case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); - btrfs_set_opt(info->mount_opt, DEGRADED); + /* subvolid=0 means give me the original fs_tree. */ + if (!ctx->subvol_objectid) + ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; + break; + case Opt_device: { + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(param->string, false); + mutex_unlock(&uuid_mutex); + if (IS_ERR(device)) + return PTR_ERR(device); + break; + } + case Opt_datasum: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } + break; + case Opt_datacow: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(ctx->mount_opt, NODATACOW); + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + } + break; + case Opt_compress_force: + case Opt_compress_force_type: + btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS); + fallthrough; + case Opt_compress: + case Opt_compress_type: + if (btrfs_parse_compress(ctx, param, opt)) + return -EINVAL; + break; + case Opt_ssd: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSSD); + btrfs_clear_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_ssd_spread: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_set_opt(ctx->mount_opt, SSD_SPREAD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_barrier: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOBARRIER); + else + btrfs_clear_opt(ctx->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + if (result.uint_32 == 0) { + btrfs_err(NULL, "invalid value 0 for thread_pool"); + return -EINVAL; + } + ctx->thread_pool_size = result.uint_32; + break; + case Opt_max_inline: + ctx->max_inline = memparse(param->string, NULL); + break; + case Opt_acl: + if (result.negated) { + fc->sb_flags &= ~SB_POSIXACL; + } else { +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + btrfs_err(NULL, "support for ACL not compiled in"); + return -EINVAL; +#endif + } + /* + * VFS limits the ability to toggle ACL on and off via remount, + * despite every file system allowing this. This seems to be + * an oversight since we all do, but it'll fail if we're + * remounting. So don't set the mask here, we'll check it in + * btrfs_reconfigure and do the toggling ourselves. + */ + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + fc->sb_flags_mask |= SB_POSIXACL; + break; + case Opt_treelog: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOTREELOG); + else + btrfs_clear_opt(ctx->mount_opt, NOTREELOG); + break; + case Opt_norecovery: + btrfs_info(NULL, +"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_flushoncommit: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); + else + btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + ctx->metadata_ratio = result.uint_32; + break; + case Opt_discard: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, NODISCARD); + } else { + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + } + break; + case Opt_discard_mode: + switch (result.uint_32) { + case Opt_discard_sync: + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); break; - case Opt_subvol: - case Opt_subvolid: - case Opt_subvolrootid: - case Opt_device: - /* - * These are parsed by btrfs_parse_early_options - * and can be happily ignored here. - */ + case Opt_discard_async: + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC); break; - case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); - btrfs_set_opt(info->mount_opt, NODATASUM); + default: + btrfs_err(NULL, "unrecognized discard mode value %s", + param->key); + return -EINVAL; + } + btrfs_clear_opt(ctx->mount_opt, NODISCARD); + break; + case Opt_space_cache: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSPACECACHE); + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + } else { + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + } + break; + case Opt_space_cache_version: + switch (result.uint_32) { + case Opt_space_cache_v1: + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_nodatacow: - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { - printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); - } else { - printk(KERN_INFO "btrfs: setting nodatacow\n"); - } - info->compress_type = BTRFS_COMPRESS_NONE; - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_set_opt(info->mount_opt, NODATACOW); - btrfs_set_opt(info->mount_opt, NODATASUM); + case Opt_space_cache_v2: + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_compress_force: - case Opt_compress_force_type: - compress_force = true; - /* Fallthrough */ - case Opt_compress: - case Opt_compress_type: - if (token == Opt_compress || - token == Opt_compress_force || - strcmp(args[0].from, "zlib") == 0) { - compress_type = "zlib"; - info->compress_type = BTRFS_COMPRESS_ZLIB; - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - } else if (strcmp(args[0].from, "lzo") == 0) { - compress_type = "lzo"; - info->compress_type = BTRFS_COMPRESS_LZO; - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_LZO); - } else if (strncmp(args[0].from, "no", 2) == 0) { - compress_type = "no"; - info->compress_type = BTRFS_COMPRESS_NONE; - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - compress_force = false; - } else { - ret = -EINVAL; - goto out; - } + default: + btrfs_err(NULL, "unrecognized space_cache value %s", + param->key); + return -EINVAL; + } + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_clear_cache: + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG); + else + btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG); + else + btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG); + break; + case Opt_usebackuproot: + btrfs_warn(NULL, + "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); - if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - pr_info("btrfs: force %s compression\n", - compress_type); - } else - pr_info("btrfs: use %s compression\n", - compress_type); - break; - case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - break; - case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); - break; - case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); - break; - case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); - btrfs_set_opt(info->mount_opt, NOBARRIER); - break; - case Opt_thread_pool: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) - info->thread_pool_size = intarg; - break; - case Opt_max_inline: - num = match_strdup(&args[0]); - if (num) { - info->max_inline = memparse(num, NULL); - kfree(num); - - if (info->max_inline) { - info->max_inline = max_t(u64, - info->max_inline, - root->sectorsize); - } - printk(KERN_INFO "btrfs: max_inline at %llu\n", - (unsigned long long)info->max_inline); - } - break; - case Opt_alloc_start: - num = match_strdup(&args[0]); - if (num) { - mutex_lock(&info->chunk_mutex); - info->alloc_start = memparse(num, NULL); - mutex_unlock(&info->chunk_mutex); - kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", - (unsigned long long)info->alloc_start); - } - break; - case Opt_noacl: - root->fs_info->sb->s_flags &= ~MS_POSIXACL; - break; - case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); - btrfs_set_opt(info->mount_opt, NOTREELOG); - break; - case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); - break; - case Opt_ratio: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", - info->metadata_ratio); - } - break; - case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); - break; - case Opt_space_cache: - btrfs_set_opt(info->mount_opt, SPACE_CACHE); - break; - case Opt_no_space_cache: - printk(KERN_INFO "btrfs: disabling disk space caching\n"); - btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + /* If we're loading the backup roots we can't trust the space cache. */ + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_skip_balance: + btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); + break; + case Opt_fatal_errors: + switch (result.uint_32) { + case Opt_fatal_errors_panic: + btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_inode_cache: - printk(KERN_INFO "btrfs: enabling inode map caching\n"); - btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + case Opt_fatal_errors_bug: + btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_clear_cache: - printk(KERN_INFO "btrfs: force clearing of disk cache\n"); - btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + default: + btrfs_err(NULL, "unrecognized fatal_errors value %s", + param->key); + return -EINVAL; + } + break; + case Opt_commit_interval: + ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } + if (ctx->commit_interval == 0) + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + break; + case Opt_rescue: + switch (result.uint_32) { + case Opt_rescue_usebackuproot: + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); break; - case Opt_user_subvol_rm_allowed: - btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + case Opt_rescue_nologreplay: + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; - case Opt_enospc_debug: - btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + case Opt_rescue_ignorebadroots: + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); break; - case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag\n"); - btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + case Opt_rescue_ignoredatacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); break; - case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery\n"); - btrfs_set_opt(info->mount_opt, RECOVERY); + case Opt_rescue_ignoremetacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + case Opt_rescue_ignoresuperflags: + btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + case Opt_rescue_parameter_all: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; - case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + default: + btrfs_info(NULL, "unrecognized rescue option '%s'", + param->key); + return -EINVAL; + } + break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment: + switch (result.uint_32) { + case Opt_fragment_parameter_all: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; - case Opt_check_integrity_print_mask: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", - info->check_integrity_print_mask); - } + case Opt_fragment_parameter_metadata: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); - ret = -EINVAL; - goto out; -#endif - case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) - btrfs_set_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) - btrfs_clear_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - else { - ret = -EINVAL; - goto out; - } + case Opt_fragment_parameter_data: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); break; - case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); - ret = -EINVAL; - goto out; default: - break; + btrfs_info(NULL, "unrecognized fragment option '%s'", + param->key); + return -EINVAL; } + break; + case Opt_ref_verify: + btrfs_set_opt(ctx->mount_opt, REF_VERIFY); + break; + case Opt_ref_tracker: + btrfs_set_opt(ctx->mount_opt, REF_TRACKER); + break; +#endif + default: + btrfs_err(NULL, "unrecognized mount option '%s'", param->key); + return -EINVAL; } -out: - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); - kfree(orig); + + return 0; +} + +/* + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount code + * paths. + */ +static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) +{ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); +} + +static bool check_ro_option(const struct btrfs_fs_info *fs_info, + unsigned long long mount_opt, unsigned long long opt, + const char *opt_name) +{ + if (mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} + +bool btrfs_check_options(const struct btrfs_fs_info *info, + unsigned long long *mount_opt, + unsigned long flags) +{ + bool ret = true; + + if (!(flags & SB_RDONLY) && + (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags"))) + ret = false; + + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) { + btrfs_err(info, "cannot disable free-space-tree"); + ret = false; + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); + ret = false; + } + + if (btrfs_check_mountopts_zoned(info, mount_opt)) + ret = false; + + if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { + btrfs_warn(info, +"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); + } + } + return ret; } /* - * Parse mount options that are required early in the mount process. + * This is subtle, we only call this during open_ctree(). We need to pre-load + * the mount options with the on-disk settings. Before the new mount API took + * effect we would do this on mount and remount. With the new mount API we'll + * only do this on the initial mount. * - * All other options will be parsed on much later in the mount process and - * only when we need to allocate a new super block. + * This isn't a change in behavior, because we're using the current state of the + * file system to set the current mount options. If you mounted with special + * options to disable these features and then remounted we wouldn't revert the + * settings, because mounting without these features cleared the on-disk + * settings, so this being called on re-mount is not needed. */ -static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) { - substring_t args[MAX_OPT_ARGS]; - char *device_name, *opts, *orig, *p; - int error = 0; - int intarg; + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, + "forcing free space tree for sector size %u with page size %lu", + fs_info->sectorsize, PAGE_SIZE); + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + } + } - if (!options) - return 0; + /* + * At this point our mount options are populated, so we only mess with + * these settings if we don't have any settings already. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + return; + + if (btrfs_is_zoned(fs_info) && + btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_info(fs_info, "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(fs_info->super_copy, 0); + return; + } + + if (btrfs_test_opt(fs_info, SPACE_CACHE)) + return; + + if (btrfs_test_opt(fs_info, NOSPACECACHE)) + return; /* - * strsep changes the string, duplicate it because parse_options - * gets called twice + * At this point we don't have explicit options set by the user, set + * them ourselves based on the state of the file system. */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + else if (btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); +} - while ((p = strsep(&opts, ",")) != NULL) { - int token; - if (!*p) - continue; +static void set_device_specific_options(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) + btrfs_set_opt(fs_info->mount_opt, SSD); - token = match_token(p, tokens, args); - switch (token) { - case Opt_subvol: - kfree(*subvol_name); - *subvol_name = match_strdup(&args[0]); - break; - case Opt_subvolid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { - /* we want the original fs_tree */ - if (!intarg) - *subvol_objectid = - BTRFS_FS_TREE_OBJECTID; - else - *subvol_objectid = intarg; + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. + */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) + btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); +} + +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *fs_root = NULL; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + BTRFS_PATH_AUTO_FREE(path); + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; + + /* + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. + */ + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + fs_root = NULL; + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. + */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_backwards(fs_root, &key, path); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; } - break; - case Opt_subvolrootid: - printk(KERN_WARNING - "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); - break; - case Opt_device: - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; + + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); - kfree(device_name); - if (error) - goto out; - break; - default: - break; + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); } + btrfs_put_root(fs_root); + fs_root = NULL; } -out: - kfree(orig); - return error; + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); + } + return name; + +err: + btrfs_put_root(fs_root); + kfree(name); + return ERR_PTR(ret); } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key location; - struct inode *inode; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; - int new = 0; - - /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. - */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; - } path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); - path->leave_spinning = 1; + return -ENOMEM; /* * Find the "default" dir item which points to the root item that we @@ -753,103 +936,77 @@ static struct dentry *get_default_root(struct super_block *sb, * to mount. */ dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { - btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. */ - btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && sb->s_root->d_inode == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_alias(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data, int silent) + struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_key key; - int err; + int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; - sb->s_d_op = &btrfs_dentry_operations; + set_default_d_op(sb, &btrfs_dentry_operations); sb->s_export_op = &btrfs_export_ops; +#ifdef CONFIG_FS_VERITY + sb->s_vop = &btrfs_verityops; +#endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; -#endif - sb->s_flags |= MS_I_VERSION; - err = open_ctree(sb, fs_devices, (char *)data); - if (err) { - printk("btrfs: open_ctree failed\n"); - return err; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; + + ret = super_setup_bdi(sb); + if (ret) { + btrfs_err(fs_info, "super_setup_bdi failed"); + return ret; } - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); + ret = open_ctree(sb, fs_devices); + if (ret) { + btrfs_err(fs_info, "open_ctree failed: %d", ret); + return ret; + } + + btrfs_emit_options(fs_info, NULL); + + inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { - err = PTR_ERR(inode); + ret = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, ret, NULL); goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail_close; } - save_mount_options(sb, data); - cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; return 0; fail_close: - close_ctree(fs_info->tree_root); - return err; + close_ctree(fs_info); + return ret; } int btrfs_sync_fs(struct super_block *sb, int wait) @@ -858,488 +1015,579 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - trace_btrfs_sync_fs(wait); + trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); return 0; } - btrfs_wait_all_ordered_extents(fs_info, 1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); + if (PTR_ERR(trans) == -ENOENT) { + /* + * Exit unless we have some pending changes + * that need to go through commit + */ + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) + return 0; + /* + * A non-blocking test if the fs is frozen. We must not + * start a new transaction here otherwise a deadlock + * happens. The pending operations are delayed to the + * next commit after thawing. + */ + if (sb_start_write_trylock(sb)) + sb_end_write(sb); + else + return 0; + trans = btrfs_start_transaction(root, 0); + } + if (IS_ERR(trans)) + return PTR_ERR(trans); } - return btrfs_commit_transaction(trans, root); + return btrfs_commit_transaction(trans); +} + +static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed) +{ + seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s); + *printed = true; } static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = info->tree_root; - char *compress_type; + const char *compress_type; + const char *subvol_name; + bool printed = false; - if (btrfs_test_opt(root, DEGRADED)) + if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) + if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); - if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", - (unsigned long long)info->max_inline); - if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", - (unsigned long long)info->alloc_start); + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) - seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) { - if (info->compress_type == BTRFS_COMPRESS_ZLIB) - compress_type = "zlib"; - else - compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) + seq_printf(seq, ",thread_pool=%u", info->thread_pool_size); + if (btrfs_test_opt(info, COMPRESS)) { + compress_type = btrfs_compress_type2str(info->compress_type); + if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) + seq_printf(seq, ":%d", info->compress_level); } - if (btrfs_test_opt(root, NOSSD)) + if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) + if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); - if (btrfs_test_opt(root, NOTREELOG)) + if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) + if (btrfs_test_opt(info, NOLOGREPLAY)) + print_rescue_option(seq, "nologreplay", &printed); + if (btrfs_test_opt(info, USEBACKUPROOT)) + print_rescue_option(seq, "usebackuproot", &printed); + if (btrfs_test_opt(info, IGNOREBADROOTS)) + print_rescue_option(seq, "ignorebadroots", &printed); + if (btrfs_test_opt(info, IGNOREDATACSUMS)) + print_rescue_option(seq, "ignoredatacsums", &printed); + if (btrfs_test_opt(info, IGNOREMETACSUMS)) + print_rescue_option(seq, "ignoremetacsums", &printed); + if (btrfs_test_opt(info, IGNORESUPERFLAGS)) + print_rescue_option(seq, "ignoresuperflags", &printed); + if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(info, DISCARD_SYNC)) seq_puts(seq, ",discard"); - if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + if (btrfs_test_opt(info, DISCARD_ASYNC)) + seq_puts(seq, ",discard=async"); + if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_free_space_cache_v1_active(info)) seq_puts(seq, ",space_cache"); + else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) + seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) + seq_puts(seq, ",rescan_uuid_tree"); + if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) + if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) + if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) - seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) + if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + if (info->metadata_ratio) + seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); + if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) + seq_printf(seq, ",commit=%u", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(info, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(info, FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif + if (btrfs_test_opt(info, REF_VERIFY)) + seq_puts(seq, ",ref_verify"); + if (btrfs_test_opt(info, REF_TRACKER)) + seq_puts(seq, ",ref_tracker"); + seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); + subvol_name = btrfs_get_subvol_name_from_objectid(info, + btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); + if (!IS_ERR(subvol_name)) { + seq_show_option(seq, "subvol", subvol_name); + kfree(subvol_name); + } return 0; } -static int btrfs_test_super(struct super_block *s, void *data) -{ - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); - - return fs_info->fs_devices == p->fs_devices; -} - -static int btrfs_set_super(struct super_block *s, void *data) -{ - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; -} - /* * subvolumes are identified by ino 256 */ -static inline int is_subvolume_inode(struct inode *inode) +static inline bool is_subvolume_inode(struct inode *inode) { if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - return 0; + return true; + return false; } -/* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. - */ -static char *setup_root_args(char *args) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + struct vfsmount *mnt) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; - - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ - - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; + struct dentry *root; + int ret; - buf = dst = kmalloc(len, GFP_NOFS); - if (!buf) - return NULL; + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); + root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. - */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); + if (!IS_ERR(root)) { + struct super_block *s = root->d_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(s); + struct inode *root_inode = d_inode(root); + u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root); + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + btrfs_err(fs_info, "'%s' is not a valid subvolume", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + btrfs_err(fs_info, + "subvol '%s' does not match subvolid %llu", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } + } - return buf; +out: + mntput(mnt); + kfree(subvol_name); + return root; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + u32 new_pool_size, u32 old_pool_size) { - struct dentry *root; - struct vfsmount *mnt; - char *newargs; - - newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); - kfree(newargs); - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + if (new_pool_size == old_pool_size) + return; - root = mount_subtree(mnt, subvol_name); + fs_info->thread_pool_size = new_pool_size; - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { - struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", - subvol_name); - } + btrfs_info(fs_info, "resize thread pool %d -> %d", + old_pool_size, new_pool_size); - return root; + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + workqueue_set_max_active(fs_info->endio_workers, new_pool_size); + workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); } -/* - * Find a superblock for the given device / mount point. - * - * Note: This is based on get_sb_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. - */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long long old_opts, int flags) { - struct block_device *bdev = NULL; - struct super_block *s; - struct dentry *root; - struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_fs_info *fs_info = NULL; - fmode_t mode = FMODE_READ; - char *subvol_name = NULL; - u64 subvol_objectid = 0; - int error = 0; - - if (!(flags & MS_RDONLY)) - mode |= FMODE_WRITE; - - error = btrfs_parse_early_options(data, mode, fs_type, - &subvol_name, &subvol_objectid, - &fs_devices); - if (error) { - kfree(subvol_name); - return ERR_PTR(error); - } - - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & SB_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & SB_RDONLY) + sync_filesystem(fs_info->sb); } +} - error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); - if (error) - return ERR_PTR(error); +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long long old_opts) +{ + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); /* - * Setup a dummy root and fs_info for test/set super. This is because - * we don't actually fill this stuff out until open_ctree, but we need - * it for searching for existing supers, so this lets us do that and - * then open_ctree will properly initialize everything later. + * We need to cleanup all defraggable inodes if the autodefragment is + * close or the filesystem is read only. */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) - return ERR_PTR(-ENOMEM); + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) { + btrfs_cleanup_defrag_inodes(fs_info); + } - fs_info->fs_devices = fs_devices; + /* If we toggled discard async */ + if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_resume(fs_info); + else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_cleanup(fs_info); + + /* If we toggled space cache */ + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); +} - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); - if (!fs_info->super_copy || !fs_info->super_for_commit) { - error = -ENOMEM; - goto error_fs_info; +static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) +{ + int ret; + + if (BTRFS_FS_ERROR(fs_info)) { + btrfs_err(fs_info, + "remounting read-write after error is not allowed"); + return -EINVAL; } - error = btrfs_open_devices(fs_devices, mode, fs_type); - if (error) - goto error_fs_info; + if (fs_info->fs_devices->rw_devices == 0) + return -EACCES; - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; + if (!btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "too many missing devices, writable remount is not allowed"); + return -EACCES; } - bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, - fs_info); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto error_close_devices; + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); + return -EINVAL; } - if (s->s_root) { - btrfs_close_devices(fs_devices); - free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) - error = -EBUSY; - } else { - char b[BDEVNAME_SIZE]; + /* + * NOTE: when remounting with a change that does writes, don't put it + * anywhere above this point, as we are not sure to be safe to write + * until we pass the above checks. + */ + ret = btrfs_start_pre_rw_mount(fs_info); + if (ret) + return ret; - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data, - flags & MS_SILENT ? 1 : 0); - } + btrfs_clear_sb_rdonly(fs_info->sb); - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) - deactivate_locked_super(s); + set_bit(BTRFS_FS_OPEN, &fs_info->flags); - return root; + /* + * If we've gone from readonly -> read-write, we need to get our + * sync/async discard lists in the right state. + */ + btrfs_discard_resume(fs_info); -error_close_devices: - btrfs_close_devices(fs_devices); -error_fs_info: - free_fs_info(fs_info); - return ERR_PTR(error); + return 0; } -static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) +static int btrfs_remount_ro(struct btrfs_fs_info *fs_info) { - spin_lock_irq(&workers->lock); - workers->max_workers = new_limit; - spin_unlock_irq(&workers->lock); -} + /* + * This also happens on 'umount -rf' or on shutdown, when the + * filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); -static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, - int new_pool_size, int old_pool_size) -{ - if (new_pool_size == old_pool_size) - return; + btrfs_discard_cleanup(fs_info); - fs_info->thread_pool_size = new_pool_size; + /* Wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* Avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); - printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", - old_pool_size, new_pool_size); + btrfs_set_sb_rdonly(fs_info->sb); + + /* + * Setting SB_RDONLY will put the cleaner thread to sleep at the next + * loop if it's already active. If it's already asleep, we'll leave + * unused block groups on disk until we're mounted read-write again + * unless we clean them up here. + */ + btrfs_delete_unused_bgs(fs_info); + + /* + * The cleaner task could be already running before we set the flag + * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make + * sure that after we finish the remount, i.e. after we call + * btrfs_commit_super(), the cleaner can no longer start a transaction + * - either because it was dropping a dead root, running delayed iputs + * or deleting an unused block group (the cleaner picked a block + * group from the list of unused block groups before we were able to + * in the previous call to btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); - btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->workers, new_pool_size); - btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, - new_pool_size); + /* + * We've set the superblock to RO mode, so we might have made the + * cleaner task sleep without running all pending delayed iputs. Go + * through all the delayed iputs here, so that if an unmount happens + * without remounting RW we don't end up at finishing close_ctree() + * with a non-empty list of delayed iputs. + */ + btrfs_run_delayed_iputs(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); + + /* + * Pause the qgroup rescan worker if it is running. We don't want it to + * be still running after we are in RO mode, as after that, by the time + * we unmount, it might have left a transaction open, so we would leak + * the transaction and/or crash. + */ + btrfs_qgroup_wait_for_completion(fs_info, false); + + return btrfs_commit_super(fs_info); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) +static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) { - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + fs_info->max_inline = ctx->max_inline; + fs_info->commit_interval = ctx->commit_interval; + fs_info->metadata_ratio = ctx->metadata_ratio; + fs_info->thread_pool_size = ctx->thread_pool_size; + fs_info->mount_opt = ctx->mount_opt; + fs_info->compress_type = ctx->compress_type; + fs_info->compress_level = ctx->compress_level; } -static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, - unsigned long old_opts, int flags) +static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) { - if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && - (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (flags & MS_RDONLY))) { - /* wait for any defraggers to finish */ - wait_event(fs_info->transaction_wait, - (atomic_read(&fs_info->defrag_running) == 0)); - if (flags & MS_RDONLY) - sync_filesystem(fs_info->sb); - } + ctx->max_inline = fs_info->max_inline; + ctx->commit_interval = fs_info->commit_interval; + ctx->metadata_ratio = fs_info->metadata_ratio; + ctx->thread_pool_size = fs_info->thread_pool_size; + ctx->mount_opt = fs_info->mount_opt; + ctx->compress_type = fs_info->compress_type; + ctx->compress_level = fs_info->compress_level; } -static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, - unsigned long old_opts) +#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old) { - /* - * We need cleanup all defragable inodes if the autodefragment is - * close or the fs is R/O. - */ - if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && - (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (fs_info->sb->s_flags & MS_RDONLY))) { - btrfs_cleanup_defrag_inodes(fs_info); + btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); + btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow"); + btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); + btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); + btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log"); + btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time"); + btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit"); + btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard"); + btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard"); + btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree"); + btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching"); + btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache"); + btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag"); + btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data"); + btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata"); + btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification"); + btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time"); + btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots"); + btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums"); + btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums"); + btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags"); + + btrfs_info_if_unset(info, old, NODATASUM, "setting datasum"); + btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); + btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); + btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); + btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers"); + btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); + btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); + btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); + btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag"); + btrfs_info_if_unset(info, old, COMPRESS, "use no compression"); + + /* Did the compression settings change? */ + if (btrfs_test_opt(info, COMPRESS) && + (!old || + old->compress_type != info->compress_type || + old->compress_level != info->compress_level || + (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) && + btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) { + const char *compress_type = btrfs_compress_type2str(info->compress_type); + + btrfs_info(info, "%s %s compression, level %d", + btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use", + compress_type, info->compress_level); } - clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + btrfs_info(info, "max_inline set to %llu", info->max_inline); } -static int btrfs_remount(struct super_block *sb, int *flags, char *data) +static int btrfs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; - unsigned old_flags = sb->s_flags; - unsigned long old_opts = fs_info->mount_opt; - unsigned long old_compress_type = fs_info->compress_type; - u64 old_max_inline = fs_info->max_inline; - u64 old_alloc_start = fs_info->alloc_start; - int old_thread_pool_size = fs_info->thread_pool_size; - unsigned int old_metadata_ratio = fs_info->metadata_ratio; - int ret; - - btrfs_remount_prepare(fs_info); - - ret = btrfs_parse_options(root, data); - if (ret) { - ret = -EINVAL; - goto restore; - } + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_context old_ctx; + int ret = 0; + bool mount_reconfigure = (fc->s_fs_info != NULL); - btrfs_remount_begin(fs_info, old_opts, *flags); - btrfs_resize_thread_pool(fs_info, - fs_info->thread_pool_size, old_thread_pool_size); + btrfs_info_to_ctx(fs_info, &old_ctx); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) - goto out; + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; - if (*flags & MS_RDONLY) { - /* - * this also happens on 'umount -rf' or on shutdown, when - * the filesystem is busy. - */ - sb->s_flags |= MS_RDONLY; + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - btrfs_dev_replace_suspend_for_unmount(fs_info); - btrfs_scrub_cancel(fs_info); - btrfs_pause_balance(fs_info); + if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; - ret = btrfs_commit_super(root); - if (ret) - goto restore; - } else { - if (fs_info->fs_devices->rw_devices == 0) { - ret = -EACCES; - goto restore; - } + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); + if (ret < 0) + return ret; - if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(*flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable remount is not allowed\n"); - ret = -EACCES; - goto restore; + btrfs_ctx_to_info(fs_info, ctx); + btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags); + btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, + old_ctx.thread_pool_size); + + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from RO to RW"); + /* Make sure free space cache options match the state on disk. */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); } - - if (btrfs_super_log_root(fs_info->super_copy) != 0) { - ret = -EINVAL; - goto restore; + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); } + } - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto restore; + ret = 0; + if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_ro(fs_info); + else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_rw(fs_info); + if (ret) + goto restore; - /* recover relocation */ - ret = btrfs_recover_relocation(root); - if (ret) - goto restore; + /* + * If we set the mask during the parameter parsing VFS would reject the + * remount. Here we can set the mask and the value will be updated + * appropriately. + */ + if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL)) + fc->sb_flags_mask |= SB_POSIXACL; - ret = btrfs_resume_balance_async(fs_info); - if (ret) - goto restore; + btrfs_emit_options(fs_info, &old_ctx); + wake_up_process(fs_info->transaction_kthread); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); + btrfs_clear_oneshot_options(fs_info); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - ret = btrfs_resume_dev_replace_async(fs_info); - if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); - goto restore; - } - sb->s_flags &= ~MS_RDONLY; - } -out: - btrfs_remount_cleanup(fs_info, old_opts); return 0; - restore: - /* We've hit an error - don't reset MS_RDONLY */ - if (sb->s_flags & MS_RDONLY) - old_flags |= MS_RDONLY; - sb->s_flags = old_flags; - fs_info->mount_opt = old_opts; - fs_info->compress_type = old_compress_type; - fs_info->max_inline = old_max_inline; - mutex_lock(&fs_info->chunk_mutex); - fs_info->alloc_start = old_alloc_start; - mutex_unlock(&fs_info->chunk_mutex); - btrfs_resize_thread_pool(fs_info, - old_thread_pool_size, fs_info->thread_pool_size); - fs_info->metadata_ratio = old_metadata_ratio; - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_ctx_to_info(fs_info, &old_ctx); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return ret; } /* Used to sort the devices by max_avail(descending sort) */ -static int btrfs_cmp_device_free_bytes(const void *dev_info1, - const void *dev_info2) +static int btrfs_cmp_device_free_bytes(const void *a, const void *b) { - if (((struct btrfs_device_info *)dev_info1)->max_avail > - ((struct btrfs_device_info *)dev_info2)->max_avail) + const struct btrfs_device_info *dev_info1 = a; + const struct btrfs_device_info *dev_info2 = b; + + if (dev_info1->max_avail > dev_info2->max_avail) return -1; - else if (((struct btrfs_device_info *)dev_info1)->max_avail < - ((struct btrfs_device_info *)dev_info2)->max_avail) + else if (dev_info1->max_avail < dev_info2->max_avail) return 1; - else return 0; } @@ -1359,102 +1607,84 @@ static inline void btrfs_descending_sort_devices( * The helper to calc the free space on the devices that can be used to store * file data. */ -static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, + u64 *free_bytes) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_device_info *devices_info; + struct btrfs_device_info AUTO_KFREE(devices_info); struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 skip_space; u64 type; u64 avail_space; - u64 used_space; u64 min_stripe_size; - int min_stripes = 1, num_stripes = 1; + int num_stripes = 1; int i = 0, nr_devices; - int ret; + const struct btrfs_raid_attr *rattr; + /* + * We aren't under the device list lock, so this is racy-ish, but good + * enough for our purposes. + */ nr_devices = fs_info->fs_devices->open_devices; - BUG_ON(!nr_devices); + if (!nr_devices) { + smp_mb(); + nr_devices = fs_info->fs_devices->open_devices; + ASSERT(nr_devices); + if (!nr_devices) { + *free_bytes = 0; + return 0; + } + } - devices_info = kmalloc(sizeof(*devices_info) * nr_devices, - GFP_NOFS); + devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), + GFP_KERNEL); if (!devices_info) return -ENOMEM; - /* calc min stripe number for data space alloction */ - type = btrfs_get_alloc_profile(root, 1); - if (type & BTRFS_BLOCK_GROUP_RAID0) { - min_stripes = 2; + /* calc min stripe number for data space allocation */ + type = btrfs_data_alloc_profile(fs_info); + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + + if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { - min_stripes = 2; - num_stripes = 2; - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { - min_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; + else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; - } - if (type & BTRFS_BLOCK_GROUP_DUP) - min_stripe_size = 2 * BTRFS_STRIPE_LEN; - else - min_stripe_size = BTRFS_STRIPE_LEN; + /* Adjust for more than 1 stripe per device */ + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev || - device->is_tgtdev_for_dev_replace) + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + !device->bdev || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; + if (i >= nr_devices) + break; + avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - do_div(avail_space, BTRFS_STRIPE_LEN); - avail_space *= BTRFS_STRIPE_LEN; - - /* - * In order to avoid overwritting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - */ - skip_space = 1024 * 1024; - - /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) - skip_space = max(fs_info->alloc_start, skip_space); + avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* - * btrfs can not use the free space in [0, skip_space - 1], - * we must subtract it from the total. In order to implement - * it, we account the used space in this range first. + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. */ - ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - return ret; - } - - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; - - /* - * we can use the free space in [0, skip_space - 1], subtract - * it from the total. - */ - if (avail_space && avail_space >= skip_space) - avail_space -= skip_space; - else - avail_space = 0; - - if (avail_space < min_stripe_size) + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; + devices_info[i].dev = device; devices_info[i].max_avail = avail_space; i++; } + rcu_read_unlock(); nr_devices = i; @@ -1462,9 +1692,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i = nr_devices - 1; avail_space = 0; - while (nr_devices >= min_stripes) { - if (num_stripes > nr_devices) - num_stripes = nr_devices; + while (nr_devices >= rattr->devs_min) { + num_stripes = min(num_stripes, nr_devices); if (devices_info[i].max_avail >= min_stripe_size) { int j; @@ -1479,51 +1708,117 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) nr_devices--; } - kfree(devices_info); *free_bytes = avail_space; return 0; } +/* + * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. + * + * If there's a redundant raid level at DATA block groups, use the respective + * multiplier to scale the sizes. + * + * Unused device space usage is based on simulating the chunk allocator + * algorithm that respects the device sizes and order of allocations. This is + * a close approximation of the actual use but there are other factors that may + * change the result (like a new metadata chunk). + * + * If metadata is exhausted, f_bavail will be 0. + */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_super_block *disk_super = fs_info->super_copy; - struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; - int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + u64 total_free_meta = 0; + u32 bits = fs_info->sectorsize_bits; + __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; + unsigned factor = 1; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; + u64 thresh = 0; + int mixed = 0; - /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&fs_info->chunk_mutex); - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + int i; + total_free_data += found->disk_total - found->disk_used; total_free_data -= btrfs_account_ro_block_groups_free_space(found); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!list_empty(&found->block_groups[i])) + factor = btrfs_bg_type_to_factor( + btrfs_raid_array[i].bg_flag); + } + } + + /* + * Metadata in mixed block group profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; } total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_namelen = BTRFS_NAME_LEN; - buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; - buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_type = BTRFS_SUPER_MAGIC; - buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); - if (ret) { - mutex_unlock(&fs_info->chunk_mutex); + buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); + buf->f_blocks >>= bits; + buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); + + /* Account global block reserve as used, it's in logical size already */ + spin_lock(&block_rsv->lock); + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; + spin_unlock(&block_rsv->lock); + + buf->f_bavail = div_u64(total_free_data, factor); + ret = btrfs_calc_avail_data_space(fs_info, &total_free_data); + if (ret) return ret; - } - buf->f_bavail += total_free_data; + buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); + + /* + * We calculate the remaining metadata space minus global reserve. If + * this is (supposedly) smaller than zero, there's no space. But this + * does not hold in practice, the exhausted state happens where's still + * some positive delta. So we apply some guesswork and compare the + * delta to a 4M threshold. (Practically observed delta was ~2M.) + * + * We probably cannot calculate the exact threshold value because this + * depends on the internal reservations requested by various + * operations, so some operations that consume a few metadata will + * succeed even if the Avail is zero. But this is better than the other + * way around. + */ + thresh = SZ_4M; + + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. + */ + if (!mixed && block_rsv->space_info->full && + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) + buf->f_bavail = 0; + + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bsize = fs_info->sectorsize; + buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1531,36 +1826,412 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32; + buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root); return 0; } +static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct btrfs_fs_info *p = fc->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_get_tree_super(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = fc->s_fs_info; + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_device *device; + struct super_block *sb; + blk_mode_t mode = sb_open_mode(fc->sb_flags); + int ret; + + btrfs_ctx_to_info(fs_info, ctx); + mutex_lock(&uuid_mutex); + + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(fc->source, true); + ASSERT(device != NULL); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + return PTR_ERR(device); + } + fs_devices = device->fs_devices; + /* + * We cannot hold uuid_mutex calling sget_fc(), it will lead to a + * locking order reversal with s_umount. + * + * So here we increase the holding number of fs_devices, this will ensure + * the fs_devices itself won't be freed. + */ + btrfs_fs_devices_inc_holding(fs_devices); + fs_info->fs_devices = fs_devices; + mutex_unlock(&uuid_mutex); + + + sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); + if (IS_ERR(sb)) { + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + /* + * Since the fs_devices is not opened, it can be freed at any + * time after unlocking uuid_mutex. We need to avoid double + * free through put_fs_context()->btrfs_free_fs_info(). + * So here we reset fs_info->fs_devices to NULL, and let the + * regular fs_devices reclaim path to handle it. + * + * This applies to all later branches where no fs_devices is + * opened. + */ + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + return PTR_ERR(sb); + } + + if (sb->s_root) { + /* + * Not the first mount of the fs thus got an existing super block. + * Will reuse the returned super block, fs_info and fs_devices. + * + * fc->s_fs_info is not touched and will be later freed by + * put_fs_context() through btrfs_free_fs_context(). + */ + ASSERT(fc->s_fs_info == fs_info); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + /* + * At this stage we may have RO flag mismatch between + * fc->sb_flags and sb->s_flags. Caller should detect such + * mismatch and reconfigure with sb->s_umount rwsem held if + * needed. + */ + } else { + struct block_device *bdev; + + /* + * The first mount of the fs thus a new superblock, fc->s_fs_info + * must be NULL, and the ownership of our fs_info and fs_devices is + * transferred to the super block. + */ + ASSERT(fc->s_fs_info == NULL); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + ret = btrfs_open_devices(fs_devices, mode, sb); + if (ret < 0) + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + if (ret < 0) { + deactivate_locked_super(sb); + return ret; + } + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + deactivate_locked_super(sb); + return -EACCES; + } + set_device_specific_options(fs_info); + bdev = fs_devices->latest_dev->bdev; + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); + ret = btrfs_fill_super(sb, fs_devices); + if (ret) { + deactivate_locked_super(sb); + return ret; + } + } + + btrfs_clear_oneshot_options(fs_info); + + fc->root = dget(sb->s_root); + return 0; +} + +/* + * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes + * with different ro/rw options") the following works: + * + * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo + * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar + * + * which looks nice and innocent but is actually pretty intricate and deserves + * a long comment. + * + * On another filesystem a subvolume mount is close to something like: + * + * (iii) # create rw superblock + initial mount + * mount -t xfs /dev/sdb /opt/ + * + * # create ro bind mount + * mount --bind -o ro /opt/foo /mnt/foo + * + * # unmount initial mount + * umount /opt + * + * Of course, there's some special subvolume sauce and there's the fact that the + * sb->s_root dentry is really swapped after mount_subtree(). But conceptually + * it's very close and will help us understand the issue. + * + * The old mount API didn't cleanly distinguish between a mount being made ro + * and a superblock being made ro. The only way to change the ro state of + * either object was by passing ms_rdonly. If a new mount was created via + * mount(2) such as: + * + * mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null); + * + * the MS_RDONLY flag being specified had two effects: + * + * (1) MNT_READONLY was raised -> the resulting mount got + * @mnt->mnt_flags |= MNT_READONLY raised. + * + * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems + * made the superblock ro. Note, how SB_RDONLY has the same value as + * ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2). + * + * Creating a subtree mount via (iii) ends up leaving a rw superblock with a + * subtree mounted ro. + * + * But consider the effect on the old mount API on btrfs subvolume mounting + * which combines the distinct step in (iii) into a single step. + * + * By issuing (i) both the mount and the superblock are turned ro. Now when (ii) + * is issued the superblock is ro and thus even if the mount created for (ii) is + * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro + * to rw for (ii) which it did using an internal remount call. + * + * IOW, subvolume mounting was inherently complicated due to the ambiguity of + * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate + * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when + * passed by mount(8) to mount(2). + * + * Enter the new mount API. The new mount API disambiguates making a mount ro + * and making a superblock ro. + * + * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either + * fsmount() or mount_setattr() this is a pure VFS level change for a + * specific mount or mount tree that is never seen by the filesystem itself. + * + * (4) To turn a superblock ro the "ro" flag must be used with + * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem + * in fc->sb_flags. + * + * But, currently the util-linux mount command already utilizes the new mount + * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's + * btrfs or not, setting the whole super block RO. To make per-subvolume mounting + * work with different options work we need to keep backward compatibility. + */ +static int btrfs_reconfigure_for_mount(struct fs_context *fc) +{ + int ret = 0; + + if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY)) + ret = btrfs_reconfigure(fc); + + return ret; +} + +static int btrfs_get_tree_subvol(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_context *dup_fc; + struct dentry *dentry; + struct vfsmount *mnt; + int ret = 0; + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * then open_ctree will properly initialize the file system specific + * settings later. btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. + */ + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + /* + * Dont call btrfs_free_fs_info() to free it as it's still + * initialized partially. + */ + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); + return -ENOMEM; + } + btrfs_init_fs_info(fs_info); + + dup_fc = vfs_dup_fs_context(fc); + if (IS_ERR(dup_fc)) { + btrfs_free_fs_info(fs_info); + return PTR_ERR(dup_fc); + } + + /* + * When we do the sget_fc this gets transferred to the sb, so we only + * need to set it on the dup_fc as this is what creates the super block. + */ + dup_fc->s_fs_info = fs_info; + + ret = btrfs_get_tree_super(dup_fc); + if (ret) + goto error; + + ret = btrfs_reconfigure_for_mount(dup_fc); + up_write(&dup_fc->root->d_sb->s_umount); + if (ret) + goto error; + mnt = vfs_create_mount(dup_fc); + put_fs_context(dup_fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + /* + * This free's ->subvol_name, because if it isn't set we have to + * allocate a buffer to hold the subvol_name, so we just drop our + * reference to it here. + */ + dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt); + ctx->subvol_name = NULL; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + fc->root = dentry; + return 0; +error: + put_fs_context(dup_fc); + return ret; +} + +static int btrfs_get_tree(struct fs_context *fc) +{ + ASSERT(fc->s_fs_info == NULL); + + return btrfs_get_tree_subvol(fc); +} + static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); } -static struct file_system_type btrfs_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV, +static void btrfs_free_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_info *fs_info = fc->s_fs_info; + + if (fs_info) + btrfs_free_fs_info(fs_info); + + if (ctx && refcount_dec_and_test(&ctx->refs)) { + kfree(ctx->subvol_name); + kfree(ctx); + } +} + +static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc) +{ + struct btrfs_fs_context *ctx = src_fc->fs_private; + + /* + * Give a ref to our ctx to this dup, as we want to keep it around for + * our original fc so we can have the subvolume name or objectid. + * + * We unset ->source in the original fc because the dup needs it for + * mounting, and then once we free the dup it'll free ->source, so we + * need to make sure we're only pointing to it in one fc. + */ + refcount_inc(&ctx->refs); + fc->fs_private = ctx; + fc->source = src_fc->source; + src_fc->source = NULL; + return 0; +} + +static const struct fs_context_operations btrfs_fs_context_ops = { + .parse_param = btrfs_parse_param, + .reconfigure = btrfs_reconfigure, + .get_tree = btrfs_get_tree, + .dup = btrfs_dup_fs_context, + .free = btrfs_free_fs_context, }; + +static int btrfs_init_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + refcount_set(&ctx->refs, 1); + fc->fs_private = ctx; + fc->ops = &btrfs_fs_context_ops; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx); + } else { + ctx->thread_pool_size = + min_t(unsigned long, num_online_cpus() + 2, 8); + ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->sb_flags |= SB_I_VERSION; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .init_fs_context = btrfs_init_fs_context, + .parameters = btrfs_fs_parameters, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, + }; + MODULE_ALIAS_FS("btrfs"); +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. + */ + file->private_data = NULL; + return 0; +} + /* - * used by btrfsctl to scan devices when no FS is mounted + * Used by /dev/btrfs-control for devices ioctls. */ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct btrfs_ioctl_vol_args *vol; - struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device = NULL; + dev_t devt = 0; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) @@ -1569,78 +2240,251 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); + ret = btrfs_check_ioctl_vol_args_path(vol); + if (ret < 0) + goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: - ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); + mutex_lock(&uuid_mutex); + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, false); + ret = PTR_ERR_OR_ZERO(device); + mutex_unlock(&uuid_mutex); + break; + case BTRFS_IOC_FORGET_DEV: + if (vol->name[0] != 0) { + ret = lookup_bdev(vol->name, &devt); + if (ret) + break; + } + ret = btrfs_forget_devices(devt); break; case BTRFS_IOC_DEVICES_READY: - ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); - if (ret) + mutex_lock(&uuid_mutex); + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, false); + if (IS_ERR_OR_NULL(device)) { + mutex_unlock(&uuid_mutex); + ret = PTR_ERR_OR_ZERO(device); break; - ret = !(fs_devices->num_devices == fs_devices->total_devices); + } + ret = !(device->fs_devices->num_devices == + device->fs_devices->total_devices); + mutex_unlock(&uuid_mutex); + break; + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + ret = btrfs_ioctl_get_supported_features((void __user*)arg); break; } +out: kfree(vol); return ret; } static int btrfs_freeze(struct super_block *sb) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb)->tree_root; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); + set_bit(BTRFS_FS_FROZEN, &fs_info->flags); + /* + * We don't need a barrier here, we'll wait for any transaction that + * could be in progress on other threads (and do delayed iputs that + * we want to avoid on a frozen filesystem), or do the commit + * ourselves. + */ + return btrfs_commit_current_transaction(fs_info->tree_root); +} + +static int check_dev_super(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_super_block *sb; + u64 last_trans; + u16 csum_type; + int ret = 0; + + /* This should be called with fs still frozen. */ + ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags)); + + /* Missing dev, no need to check. */ + if (!dev->bdev) + return 0; + + /* Only need to check the primary super block. */ + sb = btrfs_read_disk_super(dev->bdev, 0, true); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Verify the checksum. */ + csum_type = btrfs_super_csum_type(sb); + if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (unlikely(btrfs_check_super_csum(fs_info, sb))) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + + /* Btrfs_validate_super() includes fsid check against super->fsid. */ + ret = btrfs_validate_super(fs_info, sb, 0); + if (ret < 0) + goto out; + + last_trans = btrfs_get_last_trans_committed(fs_info); + if (unlikely(btrfs_super_generation(sb) != last_trans)) { + btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", + btrfs_super_generation(sb), last_trans); + ret = -EUCLEAN; + goto out; } - return btrfs_commit_transaction(trans, root); +out: + btrfs_release_disk_super(sb); + return ret; } static int btrfs_unfreeze(struct super_block *sb) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + int ret = 0; + + /* + * Make sure the fs is not changed by accident (like hibernation then + * modified by other OS). + * If we found anything wrong, we mark the fs error immediately. + * + * And since the fs is frozen, no one can modify the fs yet, thus + * we don't need to hold device_list_mutex. + */ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + ret = check_dev_super(device); + if (ret < 0) { + btrfs_handle_fs_error(fs_info, ret, + "super block on devid %llu got modified unexpectedly", + device->devid); + break; + } + } + clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); + + /* + * We still return 0, to allow VFS layer to unfreeze the fs even the + * above checks failed. Since the fs is either fine or read-only, we're + * safe to continue, without causing further damage. + */ return 0; } static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_fs_devices *cur_devices; - struct btrfs_device *dev, *first_dev = NULL; - struct list_head *head; - struct rcu_string *name; + + /* + * There should be always a valid pointer in latest_dev, it may be stale + * for a short moment in case it's being deleted but still valid until + * the end of RCU grace period. + */ + rcu_read_lock(); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); + rcu_read_unlock(); + + return 0; +} + +static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_count(fs_info, nr); + + return nr; +} + +static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_free_extent_maps(fs_info, nr_to_scan); + + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; +} + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev }; + bool can_rw; mutex_lock(&fs_info->fs_devices->device_list_mutex); - cur_devices = fs_info->fs_devices; - while (cur_devices) { - head = &cur_devices->devices; - list_for_each_entry(dev, head, dev_list) { - if (dev->missing) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - cur_devices = cur_devices->seed; + device = btrfs_find_device(fs_info->fs_devices, &lookup_args); + if (!device) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* Device not found, should not affect the running fs, just give a warning. */ + btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev); + return 0; + } + /* + * The to-be-removed device is already missing? + * + * That's weird but no special handling needed and can exit right now. + */ + if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid); + return 0; } - if (first_dev) { - rcu_read_lock(); - name = rcu_dereference(first_dev->name); - seq_escape(m, name->str, " \t\n\\"); - rcu_read_unlock(); - } else { - WARN_ON(1); + device->fs_devices->missing_devices++; + if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + list_del_init(&device->dev_alloc_list); + WARN_ON(device->fs_devices->rw_devices < 1); + device->fs_devices->rw_devices--; } + can_rw = btrfs_check_rw_degradable(fs_info, device); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * Now device is considered missing, btrfs_device_name() won't give a + * meaningful result anymore, so only output the devid. + */ + if (unlikely(!can_rw)) { + btrfs_crit(fs_info, + "btrfs device id %llu has gone missing, can not maintain read-write", + device->devid); + return -EIO; + } + btrfs_warn(fs_info, + "btrfs device id %llu has gone missing, continue as degraded", + device->devid); + btrfs_set_opt(fs_info->mount_opt, DEGRADED); return 0; } +static void btrfs_shutdown(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_force_shutdown(fs_info); +} +#endif + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -1648,18 +2492,24 @@ static const struct super_operations btrfs_super_ops = { .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .show_devname = btrfs_show_devname, - .write_inode = btrfs_write_inode, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, + .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, - .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, + .nr_cached_objects = btrfs_nr_cached_objects, + .free_cached_objects = btrfs_free_cached_objects, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + .remove_bdev = btrfs_remove_bdev, + .shutdown = btrfs_shutdown, +#endif }; static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, - .compat_ioctl = btrfs_control_ioctl, + .compat_ioctl = compat_ptr_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; @@ -1673,121 +2523,186 @@ static struct miscdevice btrfs_misc = { MODULE_ALIAS_MISCDEV(BTRFS_MINOR); MODULE_ALIAS("devname:btrfs-control"); -static int btrfs_interface_init(void) +static int __init btrfs_interface_init(void) { return misc_register(&btrfs_misc); } -static void btrfs_interface_exit(void) +static __cold void btrfs_interface_exit(void) { - if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); + misc_deregister(&btrfs_misc); } -static void btrfs_print_info(void) +static int __init btrfs_print_mod_info(void) { - printk(KERN_INFO "Btrfs loaded" + static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - ", integrity-checker=on" +#ifdef CONFIG_BTRFS_ASSERT + ", assert=on" #endif - "\n"); -} - -static int __init init_btrfs_fs(void) -{ - int err; - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = extent_io_init(); - if (err) - goto free_cachep; - - err = extent_map_init(); - if (err) - goto free_extent_io; - - err = ordered_data_init(); - if (err) - goto free_extent_map; +#ifdef CONFIG_BLK_DEV_ZONED + ", zoned=yes" +#else + ", zoned=no" +#endif +#ifdef CONFIG_FS_VERITY + ", fsverity=yes" +#else + ", fsverity=no" +#endif + ; - err = btrfs_delayed_inode_init(); - if (err) - goto free_ordered_data; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else + pr_info("Btrfs loaded%s\n", options); +#endif - err = btrfs_auto_defrag_init(); - if (err) - goto free_delayed_inode; + return 0; +} - err = btrfs_delayed_ref_init(); - if (err) - goto free_auto_defrag; +static int register_btrfs(void) +{ + return register_filesystem(&btrfs_fs_type); +} - err = btrfs_interface_init(); - if (err) - goto free_delayed_ref; +static void unregister_btrfs(void) +{ + unregister_filesystem(&btrfs_fs_type); +} - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. */ + void (*exit_func)(void); +}; - btrfs_init_lockdep(); +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_init_dio, + .exit_func = btrfs_destroy_dio, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = btrfs_extent_state_init_cachep, + .exit_func = btrfs_extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = btrfs_extent_map_init, + .exit_func = btrfs_extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; - btrfs_print_info(); - btrfs_test_free_space_cache(); +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; - return 0; +static __always_inline void btrfs_exit_btrfs_fs(void) +{ + int i; -unregister_ioctl: - btrfs_interface_exit(); -free_delayed_ref: - btrfs_delayed_ref_exit(); -free_auto_defrag: - btrfs_auto_defrag_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_ordered_data: - ordered_data_exit(); -free_extent_map: - extent_map_exit(); -free_extent_io: - extent_io_exit(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); - return err; + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } } static void __exit exit_btrfs_fs(void) { - btrfs_destroy_cachep(); - btrfs_delayed_ref_exit(); - btrfs_auto_defrag_exit(); - btrfs_delayed_inode_exit(); - ordered_data_exit(); - extent_map_exit(); - extent_io_exit(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); + btrfs_exit_btrfs_fs(); btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); } -module_init(init_btrfs_fs) +static int __init init_btrfs_fs(void) +{ + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } + mod_init_result[i] = true; + } + return 0; +} + +late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) +MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: xxhash64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: blake2b-256"); |
