diff options
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r-- | fs/btrfs/ioctl.c | 1117 |
1 files changed, 878 insertions, 239 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9d1eac15e09e..6c18bad53cd3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,16 +29,15 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +46,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -231,6 +228,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) +{ + if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + +static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) +{ + if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. @@ -247,7 +258,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; @@ -365,15 +376,15 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, - strlen(comp), 0); + ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, - 0, 0); + ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,86 +403,6 @@ update_flags: return ret; } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - bool ret = false; - - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { - fs_info->exclusive_operation = type; - ret = true; - } - spin_unlock(&fs_info->super_lock); - - return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one. This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. - * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - * must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type || - (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && - type == BTRFS_EXCLOP_DEV_ADD)) - return true; - - spin_unlock(&fs_info->super_lock); - return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ - spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ - spin_lock(&fs_info->super_lock); - WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); - spin_unlock(&fs_info->super_lock); - sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op) -{ - switch (op) { - case BTRFS_EXCLOP_BALANCE_PAUSED: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || - fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || - fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; - spin_unlock(&fs_info->super_lock); - break; - case BTRFS_EXCLOP_BALANCE: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; - spin_unlock(&fs_info->super_lock); - break; - default: - btrfs_warn(fs_info, - "invalid exclop balance operation %d requested", op); - } -} - static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); @@ -528,29 +459,16 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. */ - if (range.len < fs_info->sb->s_blocksize) + if (range.len < fs_info->sectorsize) return -EINVAL; range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); - if (ret < 0) - return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; - return 0; -} - -int __pure btrfs_is_empty_uuid(u8 *uuid) -{ - int i; - - for (i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i]) - return 0; - } - return 1; + return ret; } /* @@ -584,7 +502,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -603,6 +521,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -640,19 +559,19 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - /* Tree log can't currently deal with an inode which is a new root. */ - btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit); if (ret) goto out; @@ -703,6 +622,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { + int ret2; + /* * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the @@ -713,7 +634,9 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_tree_lock(leaf); btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); - btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + if (ret2 < 0) + btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; } @@ -751,15 +674,19 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -776,11 +703,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ @@ -817,24 +746,24 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; - pending_snapshot->dir = dir; + pending_snapshot->dir = BTRFS_I(dir); pending_snapshot->inherit = inherit; trans = btrfs_start_transaction(root, 0); @@ -842,6 +771,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -871,7 +807,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); @@ -910,7 +848,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. */ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); @@ -962,7 +902,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; @@ -1018,7 +958,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1037,15 +976,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. */ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } @@ -1097,7 +1034,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; @@ -1128,7 +1065,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = PTR_ERR(vol_args); goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); @@ -1275,14 +1215,14 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { - struct fd src = fdget(fd); + CLASS(fd, src)(fd); struct inode *src_inode; - if (!src.file) { + if (fd_empty(src)) { ret = -EINVAL; goto out_drop_write; } - src_inode = file_inode(src.file); + src_inode = file_inode(fd_file(src)); if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); @@ -1308,7 +1248,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, BTRFS_I(src_inode)->root, readonly, inherit); } - fdput(src); } out_drop_write: mnt_drop_write_file(file); @@ -1328,12 +1267,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); +out: kfree(vol_args); return ret; } @@ -1352,7 +1294,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -1362,7 +1306,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - u64 nums; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { @@ -1375,19 +1319,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (inherit->num_qgroups > PAGE_SIZE || - inherit->num_ref_copies > PAGE_SIZE || - inherit->num_excl_copies > PAGE_SIZE) { - ret = -EINVAL; - goto free_inherit; - } - - nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + - 2 * inherit->num_excl_copies; - if (vol_args->size != struct_size(inherit, qgroups, nums)) { - ret = -EINVAL; + ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size); + if (ret < 0) goto free_inherit; - } } ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), @@ -1405,7 +1339,7 @@ free_args: static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1428,7 +1362,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1481,7 +1415,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, "Attempt to set subvolume %llu read-write during send", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_drop_sem; } @@ -1675,7 +1609,7 @@ static noinline int search_ioctl(struct inode *inode, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; @@ -1888,9 +1822,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct super_block *sb = inode->i_sb; - struct btrfs_key upper_limit = BTRFS_I(inode)->location; - u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 upper_limit = btrfs_ino(BTRFS_I(inode)); + u64 treeid = btrfs_root_id(BTRFS_I(inode)->root); u64 dirid = args->dirid; unsigned long item_off; unsigned long item_len; @@ -1915,7 +1848,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * If the bottom subvolume does not exist directly under upper_limit, * construct the path in from the bottom up. */ - if (dirid != upper_limit.objectid) { + if (dirid != upper_limit) { ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); @@ -1977,7 +1910,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * btree and lock the same leaf. */ btrfs_release_path(path); - temp_inode = btrfs_iget(sb, key2.objectid, root); + temp_inode = btrfs_iget(key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; @@ -1990,7 +1923,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - if (key.offset == upper_limit.objectid) + if (key.offset == upper_limit) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; @@ -2062,7 +1995,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = root->root_key.objectid; + args->treeid = btrfs_root_id(root); if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2111,7 +2044,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) inode = file_inode(file); if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && - BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { /* * The subvolume does not exist under fd with which this is * called @@ -2158,7 +2091,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -2273,7 +2206,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, return PTR_ERR(rootrefs); } - objectid = root->root_key.objectid; + objectid = btrfs_root_id(root); key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; @@ -2346,9 +2279,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2357,7 +2290,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; - int err = 0; + int ret = 0; bool destroy_parent = false; /* We don't support snapshots with extent tree v2 yet. */ @@ -2373,7 +2306,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, return PTR_ERR(vol_args2); if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto out; } @@ -2382,29 +2315,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (ret < 0) + goto out; subvol_name = vol_args2->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } else { struct inode *old_dir; if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out; } - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, vol_args2->subvolid, 0); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_drop_write; } @@ -2424,7 +2359,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ dput(dentry); if (IS_ERR(parent)) { - err = PTR_ERR(parent); + ret = PTR_ERR(parent); goto out_drop_write; } old_dir = dir; @@ -2448,14 +2383,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * to delete without an idmapped mount. */ if (old_dir != dir && idmap != &nop_mnt_idmap) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto free_parent; } subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { - err = PTR_ERR(subvol_name_ptr); + ret = PTR_ERR(subvol_name_ptr); goto free_parent; } /* subvol_name_ptr is already nul terminated */ @@ -2466,11 +2401,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; + subvol_name = vol_args->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } @@ -2478,26 +2416,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (strchr(subvol_name, '/') || strncmp(subvol_name, "..", subvol_namelen) == 0) { - err = -EINVAL; + ret = -EINVAL; goto free_subvol_name; } if (!S_ISDIR(dir->i_mode)) { - err = -ENOTDIR; + ret = -ENOTDIR; goto free_subvol_name; } - err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (err == -EINTR) + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) goto free_subvol_name; dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_unlock_dir; } if (d_really_is_negative(dentry)) { - err = -ENOENT; + ret = -ENOENT; goto out_dput; } @@ -2517,7 +2455,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * Users who want to delete empty subvols should try * rmdir(2). */ - err = -EPERM; + ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; @@ -2528,29 +2466,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * of the subvol, not a random directory contained * within it. */ - err = -EINVAL; + ret = -EINVAL; if (root == dest) goto out_dput; - err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); - if (err) + ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); + if (ret) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(idmap, dir, dentry, 1); - if (err) + ret = btrfs_may_delete(idmap, dir, dentry, 1); + if (ret) goto out_dput; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out_dput; } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); - if (!err) + if (!ret) d_delete_notify(dir, dentry); out_dput: @@ -2567,7 +2505,7 @@ out_drop_write: out: kfree(vol_args2); kfree(vol_args); - return err; + return ret; } static int btrfs_ioctl_defrag(struct file *file, void __user *argp) @@ -2606,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; @@ -2677,12 +2624,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) goto out; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); +out_free: kfree(vol_args); out: if (restore_op) @@ -2696,9 +2647,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2714,7 +2665,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto out; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { @@ -2735,7 +2689,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2749,8 +2703,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2761,9 +2715,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2774,7 +2728,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { @@ -2790,17 +2747,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); +out_free: kfree(vol_args); return ret; } @@ -2904,7 +2862,7 @@ out: static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -2936,7 +2894,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->root_key.objectid)) { + if (!is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -2967,7 +2925,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -3178,7 +3135,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3696,7 +3653,7 @@ out: static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; @@ -3713,15 +3670,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - down_write(&fs_info->subvol_sem); - switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + down_write(&fs_info->subvol_sem); ret = btrfs_quota_enable(fs_info, sa); + up_write(&fs_info->subvol_sem); break; case BTRFS_QUOTA_CTL_DISABLE: + /* + * Lock the cleaner mutex to prevent races with concurrent + * relocation, because relocation may be building backrefs for + * blocks of the quota root while we are deleting the root. This + * is like dropping fs roots of deleted snapshots/subvolumes, we + * need the same protection. + * + * This also prevents races between concurrent tasks trying to + * disable quotas, because we will unlock and relock + * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * + * We take this here because we have the dependency of + * + * inode_lock -> subvol_sem + * + * because of rename. With relocation we can prealloc extents, + * so that makes the dependency chain + * + * cleaner_mutex -> inode_lock -> subvol_sem + * + * so we must take the cleaner_mutex here before we take the + * subvol_sem. The deadlock can't actually happen, but this + * quiets lockdep. + */ + mutex_lock(&fs_info->cleaner_mutex); + down_write(&fs_info->subvol_sem); ret = btrfs_quota_disable(fs_info); + up_write(&fs_info->subvol_sem); + mutex_unlock(&fs_info->cleaner_mutex); break; default: ret = -EINVAL; @@ -3729,18 +3714,34 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) } kfree(sa); - up_write(&fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; } +/* + * Quick check for ioctl handlers if quotas are enabled. Proper locking must be + * done before any operations. + */ +static bool qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + bool ret = true; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + ret = false; + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + return ret; +} + static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_qgroup_list *prealloc = NULL; struct btrfs_trans_handle *trans; int ret; int err; @@ -3748,6 +3749,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3758,14 +3762,27 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto drop_write; } + if (sa->assign) { + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; + goto drop_write; + } + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } + /* + * Prealloc ownership is moved to the relation handler, there it's used + * or freed on error. + */ if (sa->assign) { - ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); + ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc); + prealloc = NULL; } else { ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); } @@ -3775,13 +3792,15 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) err = btrfs_run_qgroups(trans); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) - btrfs_handle_fs_error(fs_info, err, - "failed to update qgroup status and info"); + btrfs_warn(fs_info, + "qgroup status update failed after %s relation, marked as inconsistent", + sa->assign ? "adding" : "deleting"); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: + kfree(prealloc); kfree(sa); drop_write: mnt_drop_write_file(file); @@ -3800,6 +3819,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3856,6 +3878,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3875,7 +3900,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = sa->qgroupid; if (!qgroupid) { /* take the current subvol as qgroup */ - qgroupid = root->root_key.objectid; + qgroupid = btrfs_root_id(root); } ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); @@ -3894,13 +3919,16 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3944,8 +3972,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3958,7 +3985,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -4006,7 +4033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, !btrfs_is_empty_uuid(root_item->received_uuid)) { ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4030,7 +4057,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4146,7 +4173,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; @@ -4289,7 +4316,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; @@ -4357,7 +4384,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4400,12 +4427,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; + u64 disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4458,7 +4490,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; - ret = btrfs_encoded_read(&kiocb, &iter, &args); + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + + if (ret == -EIOCBQUEUED) { + bool unlocked = false; + u64 start, lockend, count; + + start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + if (args.compression) + count = disk_io_size; + else + count = args.len; + + ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + args.compression, &unlocked); + + if (!unlocked) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + } + if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, @@ -4555,7 +4612,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; @@ -4576,11 +4633,590 @@ out_acct: return ret; } +/* + * Context that's attached to an encoded read io_uring command, in cmd->pdu. It + * contains the fields in btrfs_uring_read_extent that are necessary to finish + * off and cleanup the I/O in btrfs_uring_read_finished. + */ +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + u64 start; + u64 lockend; + int err; + bool compressed; +}; + +struct io_btrfs_cmd { + struct btrfs_uring_priv *priv; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_priv *priv = bc->priv; + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long index; + u64 cur; + size_t page_offset; + ssize_t ret; + + /* The inode lock has already been acquired in btrfs_uring_read_extent. */ + btrfs_lockdep_inode_acquire(inode, i_rwsem); + + if (priv->err) { + ret = priv->err; + goto out; + } + + if (priv->compressed) { + index = 0; + page_offset = 0; + } else { + index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[index], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + index++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (index = 0; index < priv->nr_pages; index++) + __free_page(priv->pages[index]); + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +void btrfs_uring_read_extent_endio(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); + + priv->err = err; + bc->priv = priv; + + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, struct io_uring_cmd *cmd) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto out_fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out_fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + priv->err = 0; + + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, priv); + if (ret && ret != -EIOCBQUEUED) + goto out_fail; + + /* + * If we return -EIOCBQUEUED, we're deferring the cleanup to + * btrfs_uring_read_finished(), which will handle unlocking the extent + * and inode and freeing the allocations. + */ + + /* + * We're returning to userspace with the inode lock held, and that's + * okay - it'll get unlocked in a worker thread. Call + * btrfs_lockdep_inode_release() to avoid confusing lockdep. + */ + btrfs_lockdep_inode_release(inode, i_rwsem); + + return -EIOCBQUEUED; + +out_fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + kfree(priv); + return ret; +} + +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); + size_t copy_end; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file; + struct btrfs_inode *inode; + struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + file = cmd->file; + inode = BTRFS_I(file->f_inode); + fs_info = inode->root->fs_info; + io_tree = &inode->io_tree; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } + } + + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); + if (ret < 0) + goto out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + if (issue_flags & IO_URING_F_NONBLOCK) + kiocb.ki_flags |= IOCB_NOWAIT; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, + &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + goto out_free; + + file_accessed(file); + + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == -EIOCBQUEUED) { + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); + + /* Match ioctl by not returning past EOF if uncompressed. */ + if (!data->args.compression) + count = min_t(u64, count, data->args.len); + + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); + + goto out_acct; + } + +out_free: + kfree(data->iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + loff_t pos; + struct kiocb kiocb; + struct file *file; + ssize_t ret; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + file = cmd->file; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; + data->args.len = args32.len; + data->args.unencoded_len = args32.unencoded_len; + data->args.unencoded_offset = args32.unencoded_offset; + data->args.compression = args32.compression; + data->args.encryption = args32.encryption; + memcpy(data->args.reserved, args32.reserved, + sizeof(data->args.reserved)); +#else + ret = -ENOTTY; + goto out_acct; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (data->args.flags != 0) + goto out_acct; + if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) + goto out_acct; + if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (data->args.unencoded_offset > data->args.unencoded_len) + goto out_acct; + if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) + goto out_acct; + + data->iov = data->iovstack; + ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_iov; + } + } + + if (issue_flags & IO_URING_F_NONBLOCK) { + ret = -EAGAIN; + goto out_acct; + } + + pos = data->args.offset; + ret = rw_verify_area(WRITE, file, &pos, data->args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); + if (ret) + goto out_iov; + kiocb.ki_pos = pos; + + file_start_write(file); + + ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); + if (ret > 0) + fsnotify_modify(file); + + file_end_write(file); +out_iov: + kfree(data->iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + + case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_WRITE_32: +#endif + return btrfs_uring_encoded_write(cmd, issue_flags); + } + + return -EINVAL; +} + +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. */ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. */ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. */ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; @@ -4649,11 +5285,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. */ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: @@ -4679,10 +5314,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(inode, argp, false); + return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(inode, argp, true); + return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -4699,7 +5334,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: @@ -4718,6 +5353,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: @@ -4728,6 +5365,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; |