diff options
Diffstat (limited to 'fs')
94 files changed, 1444 insertions, 1074 deletions
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f3248a3e5402..c1acbc98465d 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -66,7 +66,6 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct p9_fid *fid; struct inode *inode; struct v9fs_inode *v9inode; - unsigned int cached; if (flags & LOOKUP_RCU) return -ECHILD; @@ -76,11 +75,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto out_valid; v9inode = V9FS_I(inode); - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - - cached = v9ses->cache & (CACHE_META | CACHE_LOOSE); - - if (!cached || v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { int retval; struct v9fs_session_info *v9ses; @@ -114,6 +109,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) p9_debug(P9_DEBUG_VFS, "refresh inode: dentry = %pd (%p), got error %pe\n", dentry, dentry, ERR_PTR(retval)); + if (retval < 0) return retval; } } @@ -150,8 +146,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_revalidate = v9fs_lookup_revalidate, - .d_weak_revalidate = __v9fs_lookup_revalidate, .d_release = v9fs_dentry_release, .d_unalias_trylock = v9fs_dentry_unalias_trylock, .d_unalias_unlock = v9fs_dentry_unalias_unlock, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 69f378a83775..d0c77ec31b1d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1339,14 +1339,8 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * Don't update inode if the file type is different */ umode = p9mode2unixmode(v9ses, st, &rdev); - if (inode_wrong_type(inode, umode)) { - /* - * Do this as a way of letting the caller know the inode should not - * be reused - */ - v9fs_invalidate_inode_attr(inode); + if (inode_wrong_type(inode, umode)) goto out; - } /* * We don't want to refresh inode->i_size, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0b404e8484d2..be297e335468 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -897,14 +897,8 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) /* * Don't update inode if the file type is different */ - if (inode_wrong_type(inode, st->st_mode)) { - /* - * Do this as a way of letting the caller know the inode should not - * be reused - */ - v9fs_invalidate_inode_attr(inode); + if (inode_wrong_type(inode, st->st_mode)) goto out; - } /* * We don't want to refresh inode->i_size, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 41e37f7f67cc..3df7b9d7fbe8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -2110,9 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i], &delayed_node_trackers[i]); - btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); } } } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0d949edc0caf..b09d4ec8c77d 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -219,6 +219,13 @@ static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) return; + /* + * Only print if there are leaked references. The caller is + * holding one reference, so if refs == 1 there is no leak. + */ + if (refcount_read(&node->refs) == 1) + return; + ref_tracker_dir_print(&node->ref_dir.dir, BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c123a3ef154a..23273d0e6f22 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -973,7 +973,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, { const u64 ra_pos = readahead_pos(ractl); const u64 ra_end = ra_pos + readahead_length(ractl); - const u64 em_end = em->start + em->ram_bytes; + const u64 em_end = em->start + em->len; /* No expansion for holes and inline extents. */ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) @@ -2228,6 +2228,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_account_cgroup_owner(wbc, folio, range_len); folio_unlock(folio); } + /* + * If the fs is already in error status, do not submit any writeback + * but immediately finish it. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) { + btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); + return; + } btrfs_submit_bbio(bbio, 0); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7efd1f8a1912..fa82def46e39 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2854,12 +2854,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + u64 range_start; + u64 range_end; int ret; int ret2; if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0; + range_start = round_down(i_size_read(inode), root->fs_info->sectorsize); + range_end = round_up(end, root->fs_info->sectorsize); + + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start, + range_end - range_start); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index dad0b492a663..d86541073d42 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1106,14 +1106,15 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * If ret is 1 (no key found), it means this is an empty block group, * without any extents allocated from it and there's no block group * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree - * because we are using the block group tree feature, so block group - * items are stored in the block group tree. It also means there are no - * extents allocated for block groups with a start offset beyond this - * block group's end offset (this is the last, highest, block group). + * because we are using the block group tree feature (so block group + * items are stored in the block group tree) or this is a new block + * group created in the current transaction and its block group item + * was not yet inserted in the extent tree (that happens in + * btrfs_create_pending_block_groups() -> insert_block_group_item()). + * It also means there are no extents allocated for block groups with a + * start offset beyond this block group's end offset (this is the last, + * highest, block group). */ - if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) - ASSERT(ret == 0); - start = block_group->start; end = block_group->start + block_group->length; while (ret == 0) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..3df5f36185a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6873,7 +6873,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inode_inc_iversion(inode); inode_set_ctime_current(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 185bef0df1c2..8cb7d5a462ef 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3740,7 +3740,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { ret = -ENOMEM; - goto drop_write; + goto out; } } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1175b8192cd7..31ad8580322a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1539,8 +1539,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst ASSERT(prealloc); /* Check the level of src and dst first */ - if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) { + kfree(prealloc); return -EINVAL; + } mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index de4cb0f3fbd0..e9224145d754 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -982,7 +982,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) extent_root = btrfs_extent_root(fs_info, 0); /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */ - if (IS_ERR(extent_root)) { + if (!extent_root) { btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling"); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8dd8de6b9fb8..0765e06d00b8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3780,6 +3780,7 @@ out: /* * Mark start of chunk relocation that is cancellable. Check if the cancellation * has been requested meanwhile and don't start in that case. + * NOTE: if this returns an error, reloc_chunk_end() must not be called. * * Return: * 0 success @@ -3796,10 +3797,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); - /* - * On cancel, clear all requests but let the caller mark - * the end after cleanup operations. - */ + /* On cancel, clear all requests. */ + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); atomic_set(&fs_info->reloc_cancel_req, 0); return -ECANCELED; } @@ -3808,9 +3807,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) /* * Mark end of chunk relocation that is cancellable and wake any waiters. + * NOTE: call only if a previous call to reloc_chunk_start() succeeded. */ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) { + ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)); /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); @@ -4023,9 +4024,9 @@ out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); + reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); - reloc_chunk_end(fs_info); free_reloc_control(rc); return err; } @@ -4208,8 +4209,8 @@ out_clean: ret = ret2; out_unset: unset_reloc_control(rc); -out_end: reloc_chunk_end(fs_info); +out_end: free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4691d0bdb2e8..651b11884f82 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -694,7 +694,7 @@ static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) /* stripe->folios[] is allocated by us and no highmem is allowed. */ ASSERT(folio); - ASSERT(!folio_test_partial_kmap(folio)); + ASSERT(!folio_test_highmem(folio)); return folio_address(folio) + offset_in_folio(folio, offset); } @@ -707,7 +707,7 @@ static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int secto /* stripe->folios[] is allocated by us and no highmem is allowed. */ ASSERT(folio); - ASSERT(!folio_test_partial_kmap(folio)); + ASSERT(!folio_test_highmem(folio)); /* And the range must be contained inside the folio. */ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9230e5066fc6..96a030d28e09 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -178,7 +178,6 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; - struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -305,6 +304,9 @@ struct send_ctx { struct btrfs_lru_cache dir_created_cache; struct btrfs_lru_cache dir_utimes_cache; + + /* Must be last as it ends in a flexible-array member. */ + struct fs_path cur_inode_path; }; struct pending_dir_move { @@ -4100,6 +4102,48 @@ out: return ret; } +static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + return 0; +} + +static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_check_dir_ref_comp(entry, parent) < 0; +} + +static int record_check_dir_ref_in_tree(struct rb_root *root, + struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *tmp_ref; + int ret; + + if (rb_find(ref, root, rbtree_check_dir_ref_comp)) + return 0; + + ret = dup_ref(ref, list); + if (ret < 0) + return ret; + + tmp_ref = list_last_entry(list, struct recorded_ref, list); + rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less); + tmp_ref->root = root; + return 0; +} + static int rename_current_inode(struct send_ctx *sctx, struct fs_path *current_path, struct fs_path *new_path) @@ -4127,11 +4171,11 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct recorded_ref *cur; struct recorded_ref *cur2; LIST_HEAD(check_dirs); + struct rb_root rbtree_check_dirs = RB_ROOT; struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - u64 last_dir_ino_rm = 0; bool did_overwrite = false; bool is_orphan = false; bool can_rename = true; @@ -4435,7 +4479,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } } - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } @@ -4463,7 +4507,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } list_for_each_entry(cur, &sctx->deleted_refs, list) { - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } @@ -4473,7 +4517,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * We have a moved dir. Add the old parent to check_dirs */ cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { @@ -4507,7 +4551,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (is_current_inode_path(sctx, cur->full_path)) fs_path_reset(&sctx->cur_inode_path); } - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } @@ -4550,8 +4594,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete && - cur->dir != last_dir_ino_rm) { + } else if (ret == inode_state_did_delete) { ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; @@ -4563,7 +4606,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; - last_dir_ino_rm = cur->dir; } } } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d6e496436539..430e7419349c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1900,8 +1900,6 @@ static int btrfs_get_tree_super(struct fs_context *fc) return PTR_ERR(sb); } - set_device_specific_options(fs_info); - if (sb->s_root) { /* * Not the first mount of the fs thus got an existing super block. @@ -1946,6 +1944,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) deactivate_locked_super(sb); return -EACCES; } + set_device_specific_options(fs_info); bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); @@ -2069,7 +2068,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); if (!fs_info->super_copy || !fs_info->super_for_commit) { - btrfs_free_fs_info(fs_info); + /* + * Dont call btrfs_free_fs_info() to free it as it's still + * initialized partially. + */ + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); return -ENOMEM; } btrfs_init_fs_info(fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ca30b15ea452..c10b4c242acf 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1797,7 +1797,7 @@ static int check_inode_extref(struct extent_buffer *leaf, struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; u16 namelen; - if (unlikely(ptr + sizeof(*extref)) > end) { + if (unlikely(ptr + sizeof(*extref) > end)) { inode_ref_err(leaf, slot, "inode extref overflow, ptr %lu end %lu inode_extref size %zu", ptr, end, sizeof(*extref)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 621e0df097e3..c90b2d2cb08f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7910,6 +7910,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + /* The inode has a new name (ref/extref), so make sure we log it. */ + set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); + btrfs_init_log_ctx(&ctx, inode); ctx.logging_new_name = true; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index e00036672f33..0ea0df18a8e4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1753,7 +1753,7 @@ out: !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); - return -EINVAL; + ret = -EINVAL; } if (unlikely(cache->alloc_offset > cache->zone_capacity)) { diff --git a/fs/coredump.c b/fs/coredump.c index b5fc06a092a4..5c1c381ee380 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1468,7 +1468,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; - if (write) + if (!write) return proc_dostring(table, write, buffer, lenp, ppos); retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); @@ -1725,7 +1725,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; - } else { + } else if (!sb_rdonly(iomi.inode->i_sb)) { lockdep_assert_held(&iomi.inode->i_rwsem); } diff --git a/fs/dcache.c b/fs/dcache.c index a067fa0a965a..035cccbc9276 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2557,6 +2557,8 @@ struct dentry *d_alloc_parallel(struct dentry *parent, spin_lock(&parent->d_lock); new->d_parent = dget_dlock(parent); hlist_add_head(&new->d_sib, &parent->d_children); + if (parent->d_flags & DCACHE_DISCONNECTED) + new->d_flags |= DCACHE_DISCONNECTED; spin_unlock(&parent->d_lock); retry: diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index e5581dbeb4c2..c8d8e129eb4b 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -55,10 +55,6 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, } else { m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); - if (m->clusterofs >= 1 << vi->z_lclusterbits) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } m->pblk = le32_to_cpu(di->di_u.blkaddr); } return 0; @@ -240,21 +236,29 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { + struct erofs_inode *vi = EROFS_I(m->inode); + int err; + + if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) { + err = z_erofs_load_compact_lcluster(m, lcn, lookahead); + } else { + DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL); + err = z_erofs_load_full_lcluster(m, lcn); + } + if (err) + return err; + if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu", - m->type, lcn, EROFS_I(m->inode)->nid); + m->type, lcn, EROFS_I(m->inode)->nid); DBG_BUGON(1); return -EOPNOTSUPP; + } else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && + m->clusterofs >= (1 << vi->z_lclusterbits)) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - - switch (EROFS_I(m->inode)->datalayout) { - case EROFS_INODE_COMPRESSED_FULL: - return z_erofs_load_full_lcluster(m, lcn); - case EROFS_INODE_COMPRESSED_COMPACT: - return z_erofs_load_compact_lcluster(m, lcn, lookahead); - default: - return -EINVAL; - } + return 0; } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, @@ -268,20 +272,19 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned long lcn = m->lcn - lookback_distance; int err; + if (!lookback_distance) + break; + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; - if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; - if (!lookback_distance) - break; continue; - } else { - m->headtype = m->type; - m->map->m_la = (lcn << lclusterbits) | m->clusterofs; - return 0; } + m->headtype = m->type; + m->map->m_la = (lcn << lclusterbits) | m->clusterofs; + return 0; } erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); @@ -431,13 +434,6 @@ static int z_erofs_map_blocks_fo(struct inode *inode, end = inode->i_size; } else { if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - /* m.lcn should be >= 1 if endoff < m.clusterofs */ - if (!m.lcn) { - erofs_err(sb, "invalid logical cluster 0 at nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; - } end = (m.lcn << lclusterbits) | m.clusterofs; map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; @@ -596,7 +592,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, vi->z_fragmentoff = map->m_plen; if (recsz > offsetof(struct z_erofs_extent, pstart_lo)) vi->z_fragmentoff |= map->m_pa << 32; - } else if (map->m_plen) { + } else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) { map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED; fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT; @@ -715,6 +711,7 @@ static int z_erofs_map_sanity_check(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_sb_info *sbi = EROFS_I_SB(inode); + u64 pend; if (!(map->m_flags & EROFS_MAP_ENCODED)) return 0; @@ -732,6 +729,10 @@ static int z_erofs_map_sanity_check(struct inode *inode, if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) return -EOPNOTSUPP; + /* Filesystems beyond 48-bit physical block addresses are invalid */ + if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) || + (pend >> sbi->blkszbits) >= BIT_ULL(48))) + return -EFSCORRUPTED; return 0; } diff --git a/fs/exec.c b/fs/exec.c index 6b70c6726d31..4298e7e08d5d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error && !write) + if (!error && write) validate_coredump_safety(); return error; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 329697c89d09..38210fb6901c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -29,7 +29,6 @@ enum exfat_error_mode { enum { NLS_NAME_NO_LOSSY = 0, /* no lossy */ NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */ - NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */ }; #define EXFAT_HASH_BITS 8 diff --git a/fs/exfat/file.c b/fs/exfat/file.c index f246cf439588..adc37b4d7fc2 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -509,8 +509,8 @@ static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long ar static int exfat_ioctl_set_volume_label(struct super_block *sb, unsigned long arg) { - int ret = 0, lossy; - char label[FSLABEL_MAX]; + int ret = 0, lossy, label_len; + char label[FSLABEL_MAX] = {0}; struct exfat_uni_name uniname; if (!capable(CAP_SYS_ADMIN)) @@ -520,8 +520,9 @@ static int exfat_ioctl_set_volume_label(struct super_block *sb, return -EFAULT; memset(&uniname, 0, sizeof(uniname)); + label_len = strnlen(label, FSLABEL_MAX - 1); if (label[0]) { - ret = exfat_nls_to_utf16(sb, label, FSLABEL_MAX, + ret = exfat_nls_to_utf16(sb, label, label_len, &uniname, &lossy); if (ret < 0) return ret; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 7eb9c67fd35f..745dce29ddb5 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -442,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, return namelen; /* return error value */ if ((lossy && !lookup) || !namelen) - return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL; + return -EINVAL; return 0; } @@ -642,10 +642,14 @@ static int exfat_find(struct inode *dir, const struct qstr *qname, info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); - info->size = le64_to_cpu(ep2->dentry.stream.valid_size); info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); info->size = le64_to_cpu(ep2->dentry.stream.size); + if (info->valid_size < 0) { + exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size); + return -EIO; + } + if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) { exfat_fs_error(sb, "data size is invalid(%lld)", info->size); return -EIO; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 8243d94ceaf4..57db08a5271c 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -616,9 +616,6 @@ static int exfat_nls_to_ucs2(struct super_block *sb, unilen++; } - if (p_cstring[i] != '\0') - lossy |= NLS_NAME_OVERLEN; - *uniname = '\0'; p_uniname->name_len = unilen; p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b3e9b7bd7978..a0e66bc10093 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -280,9 +280,16 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); - /* In the no journal case, we can just do a bforget and return */ + /* + * In the no journal case, we should wait for the ongoing buffer + * to complete and do a forget. + */ if (!ext4_handle_valid(handle)) { - bforget(bh); + if (bh) { + clear_buffer_dirty(bh); + wait_on_buffer(bh); + __bforget(bh); + } return 0; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..e99306a8f47c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5319,6 +5319,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); + /* Detect invalid flag combination - can't have both inline data and extents */ + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_error_inode(inode, function, line, 0, + "inode has both inline data and extents flags"); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 33c3a89396b1..82d5e7501455 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -513,7 +513,7 @@ void ext4_release_orphan_info(struct super_block *sb) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); - kfree(oi->of_binfo); + kvfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( @@ -637,7 +637,7 @@ int ext4_init_orphan_info(struct super_block *sb) out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); - kfree(oi->of_binfo); + kvfree(oi->of_binfo); out_put: iput(inode); return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef38e62cda8f..775aa4f63aa3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1497,8 +1497,8 @@ static bool f2fs_map_blocks_cached(struct inode *inode, struct f2fs_dev_info *dev = &sbi->devs[bidx]; map->m_bdev = dev->bdev; - map->m_pblk -= dev->start_blk; map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); + map->m_pblk -= dev->start_blk; } else { map->m_bdev = inode->i_sb->s_bdev; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fd8e7b0b2166..db7afb806411 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1820,7 +1820,7 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - iput(inode); + atomic_dec(&inode->i_count); } trace_f2fs_drop_inode(inode, 0); return 0; diff --git a/fs/file_attr.c b/fs/file_attr.c index 12424d4945d0..1dcec88c0680 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -84,7 +84,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) int error; if (!inode->i_op->fileattr_get) - return -EOPNOTSUPP; + return -ENOIOCTLCMD; error = security_inode_file_getattr(dentry, fa); if (error) @@ -270,7 +270,7 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, int err; if (!inode->i_op->fileattr_set) - return -EOPNOTSUPP; + return -ENOIOCTLCMD; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; @@ -312,8 +312,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp) int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; if (!err) err = put_user(fa.flags, argp); return err; @@ -335,8 +333,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) fileattr_fill_flags(&fa, flags); err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; } } return err; @@ -349,8 +345,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp) int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; if (!err) err = copy_fsxattr_to_user(&fa, argp); @@ -371,8 +365,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) if (!err) { err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; } } return err; @@ -424,6 +416,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, } error = vfs_fileattr_get(filepath.dentry, &fa); + if (error == -ENOIOCTLCMD || error == -ENOTTY) + error = -EOPNOTSUPP; if (error) return error; @@ -491,6 +485,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, if (!error) { error = vfs_fileattr_set(mnt_idmap(filepath.mnt), filepath.dentry, &fa); + if (error == -ENOIOCTLCMD || error == -ENOTTY) + error = -EOPNOTSUPP; mnt_drop_write(filepath.mnt); } diff --git a/fs/file_table.c b/fs/file_table.c index b223d873e48b..cd4a3db4659a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -192,7 +192,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_sb_err = 0; /* - * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While + * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 57032eadca6c..fdc175e93f74 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -536,8 +536,6 @@ int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa) cleanup: fuse_priv_ioctl_cleanup(inode, ff); - if (err == -ENOTTY) - err = -EOPNOTSUPP; return err; } @@ -574,7 +572,5 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, cleanup: fuse_priv_ioctl_cleanup(inode, ff); - if (err == -ENOTTY) - err = -EOPNOTSUPP; return err; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9c94ed8c3ab0..f42548ee9083 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -478,14 +478,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, if (!hugetlb_vma_trylock_write(vma)) continue; - /* - * Skip VMAs without shareable locks. Per the design in commit - * 40549ba8f8e0, these will be handled by remove_inode_hugepages() - * called after this function with proper locking. - */ - if (!__vma_shareable_lock(vma)) - goto skip; - v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); @@ -496,7 +488,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas. */ -skip: hugetlb_vma_unlock_write(vma); } } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index c7867139af69..3e510564de6e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1659,6 +1659,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) int drop_reserve = 0; int err = 0; int was_modified = 0; + int wait_for_writeback = 0; if (is_handle_aborted(handle)) return -EROFS; @@ -1782,18 +1783,22 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) } /* - * The buffer is still not written to disk, we should - * attach this buffer to current transaction so that the - * buffer can be checkpointed only after the current - * transaction commits. + * The buffer has not yet been written to disk. We should + * either clear the buffer or ensure that the ongoing I/O + * is completed, and attach this buffer to current + * transaction so that the buffer can be checkpointed only + * after the current transaction commits. */ clear_buffer_dirty(bh); + wait_for_writeback = 1; __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } drop: __brelse(bh); spin_unlock(&jh->b_state_lock); + if (wait_for_writeback) + wait_on_buffer(bh); jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index df01d2876b68..9056f05a67dc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -270,19 +270,31 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) mirror->layout = NULL; } -static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count, + gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; - u32 dss_id; mirror = kzalloc(sizeof(*mirror), gfp_flags); - if (mirror != NULL) { - spin_lock_init(&mirror->lock); - refcount_set(&mirror->ref, 1); - INIT_LIST_HEAD(&mirror->mirrors); - for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) - nfs_localio_file_init(&mirror->dss[dss_id].nfl); + if (mirror == NULL) + return NULL; + + spin_lock_init(&mirror->lock); + refcount_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + + mirror->dss_count = dss_count; + mirror->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); + if (mirror->dss == NULL) { + kfree(mirror); + return NULL; } + + for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); + return mirror; } @@ -507,17 +519,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (dss_count > 1 && stripe_unit == 0) goto out_err_free; - fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); + fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - fls->mirror_array[i]->dss_count = dss_count; - fls->mirror_array[i]->dss = - kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), - gfp_flags); - for (dss_id = 0; dss_id < dss_count; dss_id++) { dss_info = &fls->mirror_array[i]->dss[dss_id]; dss_info->mirror = fls->mirror_array[i]; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 6fddf43d729c..5998d6bd8a4f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -222,6 +222,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; + clp->cl_last_renewal = jiffies; #if IS_ENABLED(CONFIG_NFS_V4_1) init_waitqueue_head(&clp->cl_lock_waitq); #endif diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f58098417142..411776718494 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3636,6 +3636,7 @@ struct nfs4_closedata { } lr; struct nfs_fattr fattr; unsigned long timestamp; + unsigned short retrans; }; static void nfs4_free_closedata(void *data) @@ -3664,6 +3665,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) .state = state, .inode = calldata->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3711,6 +3713,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -5593,9 +5596,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5709,10 +5714,12 @@ static int nfs4_write_done_cb(struct rpc_task *task, .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, NFS_SERVER(inode), task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -6726,6 +6733,7 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + unsigned short retrans; struct { struct nfs4_layoutreturn_args arg; struct nfs4_layoutreturn_res res; @@ -6746,6 +6754,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) .inode = data->inode, .stateid = &data->stateid, .task_is_privileged = data->args.seq_args.sa_privileged, + .retrans = data->retrans, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6817,6 +6826,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, &exception); + data->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -7093,6 +7103,7 @@ struct nfs4_unlockdata { struct file_lock fl; struct nfs_server *server; unsigned long timestamp; + unsigned short retrans; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -7147,6 +7158,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) struct nfs4_exception exception = { .inode = calldata->lsp->ls_state->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -7180,6 +7192,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_status = nfs4_async_handle_exception(task, calldata->server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) rpc_restart_call_prepare(task); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0fb6905736d5..336c510f3750 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1535,7 +1535,8 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE + | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } return 0; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index c318cf74e388..0f1a35400cd5 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -125,6 +125,13 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, return 0; } +static __be32 +nfsd4_ff_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, + struct nfsd4_layoutcommit *lcp) +{ + return nfs_ok; +} + const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, @@ -133,4 +140,5 @@ const struct nfsd4_layout_ops ff_layout_ops = { .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, .encode_layoutget = nfsd4_ff_encode_layoutget, + .proc_layoutcommit = nfsd4_ff_proc_layoutcommit, }; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e466cf52d7d7..7f7e6bb23a90 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -988,10 +988,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_nf) + if (u->read.rd_nf) { + trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, + u->read.rd_offset, u->read.rd_length); nfsd_file_put(u->read.rd_nf); - trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, - u->read.rd_offset, u->read.rd_length); + } } static __be32 @@ -2892,10 +2893,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) rqstp->rq_lease_breaker = (void **)&cstate->clp; - trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt); + trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; + if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) { + /* If there are still more operations to process, + * stop here and report NFS4ERR_RESOURCE. */ + if (cstate->minorversion == 0 && + args->client_opcnt > resp->opcnt) { + op->status = nfserr_resource; + goto encode_op; + } + } + /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -2972,7 +2983,7 @@ encode_op: status = op->status; } - trace_nfsd_compound_status(args->opcnt, resp->opcnt, + trace_nfsd_compound_status(args->client_opcnt, resp->opcnt, status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 81fa7cc6c77b..c1b54322c412 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3902,6 +3902,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs ca->headerpadsz = 0; ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); + ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c0a3c6a7c8bb..6040a6145dad 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2488,8 +2488,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) return false; - if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0) + if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0) return false; + argp->opcnt = min_t(u32, argp->client_opcnt, + NFSD_MAX_OPS_PER_COMPOUND); if (argp->opcnt > ARRAY_SIZE(argp->iops)) { argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops)); @@ -2628,10 +2630,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, __be32 *p; __be32 pathlen; int pathlen_offset; - int strlen, count=0; char *str, *end, *next; - - dprintk("nfsd4_encode_components(%s)\n", components); + int count = 0; pathlen_offset = xdr->buf->len; p = xdr_reserve_space(xdr, 4); @@ -2658,9 +2658,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, for (; *end && (*end != sep); end++) /* find sep or end of string */; - strlen = end - str; - if (strlen) { - if (xdr_stream_encode_opaque(xdr, str, strlen) < 0) + if (end > str) { + if (xdr_stream_encode_opaque(xdr, str, end - str) < 0) return nfserr_resource; count++; } else @@ -2939,6 +2938,12 @@ struct nfsd4_fattr_args { typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args); +static __be32 nfsd4_encode_fattr4__inval(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfserr_inval; +} + static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) { @@ -3560,6 +3565,8 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop, [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support, + [FATTR4_TIME_DELEG_ACCESS] = nfsd4_encode_fattr4__inval, + [FATTR4_TIME_DELEG_MODIFY] = nfsd4_encode_fattr4__inval, [FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments, }; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index ea87b42894dd..f19320018639 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -57,6 +57,9 @@ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 200 + struct nfsd_genl_rqstp { struct sockaddr rq_daddr; struct sockaddr rq_saddr; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d4b48602b2b0..ee0570cbdd9e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -903,6 +903,7 @@ struct nfsd4_compoundargs { char * tag; u32 taglen; u32 minorversion; + u32 client_opcnt; u32 opcnt; bool splice_ok; struct nfsd4_op *ops; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 1161eabf11ee..9cc7eb863643 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -17,6 +17,7 @@ #include "fanotify/fanotify.h" #include "fdinfo.h" #include "fsnotify.h" +#include "../internal.h" #if defined(CONFIG_PROC_FS) @@ -46,7 +47,12 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; + if (!super_trylock_shared(inode->i_sb)) + return; + ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); + up_read(&inode->i_sb->s_umount); + if ((ret == FILEID_INVALID) || (ret < 0)) return; diff --git a/fs/nsfs.c b/fs/nsfs.c index 648dc59bef7f..79b026a36fb6 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -490,7 +490,9 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); - VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (ns->inum != fid->ns_inum) + return NULL; if (!__ns_ref_get(ns)) return NULL; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 86f2631e6360..10923bf7c8b8 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -867,6 +867,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, mlog_errno(ret); goto out; } + /* + * Invalidate extent cache after moving/defragging to prevent + * stale cached data with outdated extent flags. + */ + ocfs2_extent_map_trunc(inode, cpos); context->clusters_moved += alloc_size; next: diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index aac7e34f56c1..604a82acd164 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -178,7 +178,7 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, err = ovl_real_fileattr_get(old, &oldfa); if (err) { /* Ntfs-3g returns -EINVAL for "no fileattr support" */ - if (err == -EOPNOTSUPP || err == -EINVAL) + if (err == -ENOTTY || err == -EINVAL) return 0; pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", old->dentry, err); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fc52c796061d..7ab2c9daffd0 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -369,11 +369,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); - /* - * Overlayfs doesn't support deferred completions, don't copy - * this property in case it is set by the issuer. - */ - ifl &= ~IOCB_DIO_CALLER_COMP; ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx); out_unlock: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index aaa4cf579561..e11f310ce092 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -720,7 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa) if (err) return err; - return vfs_fileattr_get(realpath->dentry, fa); + err = vfs_fileattr_get(realpath->dentry, fa); + if (err == -ENOIOCTLCMD) + err = -ENOTTY; + return err; } int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 4076336fbba6..572a9925bd6c 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1782,15 +1782,13 @@ int resctrl_mon_resource_init(void) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; if (r->mon.mbm_cntr_assignable) { - if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); - if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); - mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; - mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & - (READS_TO_LOCAL_MEM | - READS_TO_LOCAL_S_MEM | - NON_TEMP_WRITE_TO_LOCAL_MEM); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index a4c02199fef4..17bd368574e9 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -5,17 +5,16 @@ config CIFS select NLS select NLS_UCS2_UTILS select CRYPTO - select CRYPTO_MD5 - select CRYPTO_SHA256 - select CRYPTO_SHA512 select CRYPTO_CMAC - select CRYPTO_HMAC select CRYPTO_AEAD2 select CRYPTO_CCM select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES select CRYPTO_LIB_ARC4 + select CRYPTO_LIB_MD5 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 select KEYS select DNS_RESOLVER select ASN1 diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 63b3b1290bed..ce2ebc213a1d 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -339,7 +339,6 @@ int sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid, struct cifs_fattr *fattr, uint sidtype) { - int rc = 0; struct key *sidkey; char *sidstr; const struct cred *saved_cred; @@ -446,12 +445,12 @@ out_revert_creds: * fails then we just fall back to using the ctx->linux_uid/linux_gid. */ got_valid_id: - rc = 0; if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else fattr->cf_gid = fgid; - return rc; + + return 0; } int diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 7b7c8c38fdd0..801824825ecf 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -24,14 +24,43 @@ #include <linux/iov_iter.h> #include <crypto/aead.h> #include <crypto/arc4.h> +#include <crypto/md5.h> +#include <crypto/sha2.h> -static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, - void *priv, void *priv2) +static int cifs_sig_update(struct cifs_calc_sig_ctx *ctx, + const u8 *data, size_t len) { - struct shash_desc *shash = priv; + if (ctx->md5) { + md5_update(ctx->md5, data, len); + return 0; + } + if (ctx->hmac) { + hmac_sha256_update(ctx->hmac, data, len); + return 0; + } + return crypto_shash_update(ctx->shash, data, len); +} + +static int cifs_sig_final(struct cifs_calc_sig_ctx *ctx, u8 *out) +{ + if (ctx->md5) { + md5_final(ctx->md5, out); + return 0; + } + if (ctx->hmac) { + hmac_sha256_final(ctx->hmac, out); + return 0; + } + return crypto_shash_final(ctx->shash, out); +} + +static size_t cifs_sig_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) +{ + struct cifs_calc_sig_ctx *ctx = priv; int ret, *pret = priv2; - ret = crypto_shash_update(shash, iter_base, len); + ret = cifs_sig_update(ctx, iter_base, len); if (ret < 0) { *pret = ret; return len; @@ -42,21 +71,20 @@ static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, /* * Pass the data from an iterator into a hash. */ -static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, - struct shash_desc *shash) +static int cifs_sig_iter(const struct iov_iter *iter, size_t maxsize, + struct cifs_calc_sig_ctx *ctx) { struct iov_iter tmp_iter = *iter; int err = -EIO; - if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err, - cifs_shash_step) != maxsize) + if (iterate_and_advance_kernel(&tmp_iter, maxsize, ctx, &err, + cifs_sig_step) != maxsize) return err; return 0; } -int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash) +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, + char *signature, struct cifs_calc_sig_ctx *ctx) { int i; ssize_t rc; @@ -82,8 +110,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst, return -EIO; } - rc = crypto_shash_update(shash, - iov[i].iov_base, iov[i].iov_len); + rc = cifs_sig_update(ctx, iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with payload\n", __func__); @@ -91,11 +118,11 @@ int __cifs_calc_signature(struct smb_rqst *rqst, } } - rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash); + rc = cifs_sig_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), ctx); if (rc < 0) return rc; - rc = crypto_shash_final(shash, signature); + rc = cifs_sig_final(ctx, signature); if (rc) cifs_dbg(VFS, "%s: Could not generate hash\n", __func__); @@ -112,29 +139,22 @@ int __cifs_calc_signature(struct smb_rqst *rqst, static int cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature) { - int rc; + struct md5_ctx ctx; if (!rqst->rq_iov || !signature || !server) return -EINVAL; - - rc = cifs_alloc_hash("md5", &server->secmech.md5); - if (rc) - return -1; - - rc = crypto_shash_init(server->secmech.md5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init md5\n", __func__); - return rc; + if (fips_enabled) { + cifs_dbg(VFS, + "MD5 signature support is disabled due to FIPS\n"); + return -EOPNOTSUPP; } - rc = crypto_shash_update(server->secmech.md5, - server->session_key.response, server->session_key.len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response\n", __func__); - return rc; - } + md5_init(&ctx); + md5_update(&ctx, server->session_key.response, server->session_key.len); - return __cifs_calc_signature(rqst, server, signature, server->secmech.md5); + return __cifs_calc_signature( + rqst, server, signature, + &(struct cifs_calc_sig_ctx){ .md5 = &ctx }); } /* must be called with server->srv_mutex held */ @@ -405,11 +425,11 @@ static __le64 find_timestamp(struct cifs_ses *ses) } static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, - const struct nls_table *nls_cp, struct shash_desc *hmacmd5) + const struct nls_table *nls_cp) { - int rc = 0; int len; char nt_hash[CIFS_NTHASH_SIZE]; + struct hmac_md5_ctx hmac_ctx; __le16 *user; wchar_t *domain; wchar_t *server; @@ -417,17 +437,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc); - return rc; - } - - rc = crypto_shash_init(hmacmd5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); - return rc; - } + hmac_md5_init_usingrawkey(&hmac_ctx, nt_hash, CIFS_NTHASH_SIZE); /* convert ses->user_name to unicode */ len = ses->user_name ? strlen(ses->user_name) : 0; @@ -442,12 +452,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, *(u16 *)user = 0; } - rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len); + hmac_md5_update(&hmac_ctx, (const u8 *)user, 2 * len); kfree(user); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc); - return rc; - } /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -459,12 +465,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); - rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len); + hmac_md5_update(&hmac_ctx, (const u8 *)domain, 2 * len); kfree(domain); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc); - return rc; - } } else { /* We use ses->ip_addr if no domain name available */ len = strlen(ses->ip_addr); @@ -474,25 +476,16 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, return -ENOMEM; len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); - rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len); + hmac_md5_update(&hmac_ctx, (const u8 *)server, 2 * len); kfree(server); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc); - return rc; - } } - rc = crypto_shash_final(hmacmd5, ntlmv2_hash); - if (rc) - cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); - - return rc; + hmac_md5_final(&hmac_ctx, ntlmv2_hash); + return 0; } -static int -CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5) +static void CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) { - int rc; struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); unsigned int hash_len; @@ -501,35 +494,15 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_ hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc); - return rc; - } - - rc = crypto_shash_init(hmacmd5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); - return rc; - } - if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); else memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc); - return rc; - } - - /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash); - if (rc) - cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); - - return rc; + /* Note that the HMAC-MD5 value overwrites ntlmv2->challenge.key */ + hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE, + ntlmv2->challenge.key, hash_len, + ntlmv2->ntlmv2_hash); } /* @@ -586,7 +559,6 @@ out: int setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) { - struct shash_desc *hmacmd5 = NULL; unsigned char *tiblob = NULL; /* target info blob */ struct ntlmv2_resp *ntlmv2; char ntlmv2_hash[16]; @@ -657,51 +629,29 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) ntlmv2->client_chal = cc; ntlmv2->reserved2 = 0; - rc = cifs_alloc_hash("hmac(md5)", &hmacmd5); - if (rc) { - cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc); + if (fips_enabled) { + cifs_dbg(VFS, "NTLMv2 support is disabled due to FIPS\n"); + rc = -EOPNOTSUPP; goto unlock; } /* calculate ntlmv2_hash */ - rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5); + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc); goto unlock; } /* calculate first part of the client response (CR1) */ - rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5); - if (rc) { - cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc); - goto unlock; - } + CalcNTLMv2_response(ses, ntlmv2_hash); /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc); - goto unlock; - } - - rc = crypto_shash_init(hmacmd5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); - goto unlock; - } - - rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc); - goto unlock; - } - - rc = crypto_shash_final(hmacmd5, ses->auth_key.response); - if (rc) - cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); + hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE, + ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE, + ses->auth_key.response); + rc = 0; unlock: cifs_server_unlock(ses->server); - cifs_free_hash(&hmacmd5); setup_ntlmv2_rsp_ret: kfree_sensitive(tiblob); @@ -743,9 +693,6 @@ void cifs_crypto_secmech_release(struct TCP_Server_Info *server) { cifs_free_hash(&server->secmech.aes_cmac); - cifs_free_hash(&server->secmech.hmacsha256); - cifs_free_hash(&server->secmech.md5); - cifs_free_hash(&server->secmech.sha512); if (server->secmech.enc) { crypto_free_aead(server->secmech.enc); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 05b1fa76e8cc..185ac41bd7e9 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -173,7 +173,7 @@ module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); module_param(enable_gcm_256, bool, 0644); -MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0"); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/1"); module_param(require_gcm_256, bool, 0644); MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); @@ -2139,13 +2139,9 @@ MODULE_DESCRIPTION "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); MODULE_SOFTDEP("ecb"); -MODULE_SOFTDEP("hmac"); -MODULE_SOFTDEP("md5"); MODULE_SOFTDEP("nls"); MODULE_SOFTDEP("aes"); MODULE_SOFTDEP("cmac"); -MODULE_SOFTDEP("sha256"); -MODULE_SOFTDEP("sha512"); MODULE_SOFTDEP("aead2"); MODULE_SOFTDEP("ccm"); MODULE_SOFTDEP("gcm"); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 8f6f567d7474..203e2aaa3c25 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -24,6 +24,7 @@ #include "cifsacl.h" #include <crypto/internal/hash.h> #include <uapi/linux/cifs/cifs_mount.h> +#include "../common/cifsglob.h" #include "../common/smb2pdu.h" #include "smb2pdu.h" #include <linux/filelock.h> @@ -221,9 +222,6 @@ struct session_key { /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ - struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ - struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */ struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ @@ -536,8 +534,6 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *ses, struct TCP_Server_Info *server); - int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *, - bool allocate_crypto); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, struct cifsFileInfo *src_file); int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, @@ -702,12 +698,6 @@ get_rfc1002_length(void *buf) return be32_to_cpu(*((__be32 *)buf)) & 0xffffff; } -static inline void -inc_rfc1001_len(void *buf, int count) -{ - be32_add_cpu((__be32 *)buf, count); -} - struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; @@ -740,7 +730,7 @@ struct TCP_Server_Info { bool nosharesock; bool tcp_nodelay; bool terminate; - unsigned int credits; /* send no more requests at once */ + int credits; /* send no more requests at once */ unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ unsigned int max_in_flight; /* max number of requests that were on wire */ @@ -1021,8 +1011,6 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) #define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) -#define CIFS_DEFAULT_IOSIZE (1024 * 1024) - /* * Windows only supports a max of 60kb reads and 65535 byte writes. Default to * those values when posix extensions aren't in force. In actuality here, we @@ -2148,30 +2136,20 @@ extern mempool_t cifs_io_request_pool; extern mempool_t cifs_io_subrequest_pool; /* Operations for different SMB versions */ -#define SMB1_VERSION_STRING "1.0" -#define SMB20_VERSION_STRING "2.0" #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY extern struct smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; extern struct smb_version_operations smb20_operations; extern struct smb_version_values smb20_values; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ -#define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; -#define SMBDEFAULT_VERSION_STRING "default" extern struct smb_version_values smbdefault_values; -#define SMB3ANY_VERSION_STRING "3" extern struct smb_version_values smb3any_values; -#define SMB30_VERSION_STRING "3.0" extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; -#define SMB302_VERSION_STRING "3.02" -#define ALT_SMB302_VERSION_STRING "3.0.2" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; -#define SMB311_VERSION_STRING "3.1.1" -#define ALT_SMB311_VERSION_STRING "3.11" extern struct smb_version_operations smb311_operations; extern struct smb_version_values smb311_values; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index e8fba98690ce..3528c365a452 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -9,6 +9,7 @@ #define _CIFSPROTO_H #include <linux/nls.h> #include <linux/ctype.h> +#include "cifsglob.h" #include "trace.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" @@ -615,6 +616,8 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, extern struct TCP_Server_Info * cifs_find_tcp_session(struct smb3_fs_context *ctx); +struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal); + void __cifs_put_smb_ses(struct cifs_ses *ses); extern struct cifs_ses * @@ -632,9 +635,13 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); -int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash); +struct cifs_calc_sig_ctx { + struct md5_ctx *md5; + struct hmac_sha256_ctx *hmac; + struct shash_desc *shash; +}; +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, + char *signature, struct cifs_calc_sig_ctx *ctx); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 2881efcbe09a..7da194f29fef 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1311,6 +1311,8 @@ cifs_readv_callback(struct mid_q_entry *mid) .rreq_debug_id = rdata->rreq->debug_id, .rreq_debug_index = rdata->subreq.debug_index, }; + unsigned int rreq_debug_id = rdata->rreq->debug_id; + unsigned int subreq_debug_index = rdata->subreq.debug_index; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, @@ -1374,6 +1376,9 @@ do_retry: __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } + trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, + server->credits, server->in_flight, + 0, cifs_trace_rw_credits_read_response_clear); rdata->credits.value = 0; rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; @@ -1381,6 +1386,9 @@ do_retry: netfs_read_subreq_terminated(&rdata->subreq); release_mid(mid); add_credits(server, &credits, 0); + trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, + server->credits, server->in_flight, + credits.value, cifs_trace_rw_credits_read_response_add); } /* cifs_async_readv - send an async write, and set up mid to handle result */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dd12f3eb61dc..55cb4b0cbd48 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -310,6 +310,8 @@ cifs_abort_connection(struct TCP_Server_Info *server) server->ssocket->flags); sock_release(server->ssocket); server->ssocket = NULL; + } else if (cifs_rdma_enabled(server)) { + smbd_destroy(server); } server->sequence_number = 0; server->session_estab = false; @@ -338,12 +340,6 @@ cifs_abort_connection(struct TCP_Server_Info *server) mid_execute_callback(mid); release_mid(mid); } - - if (cifs_rdma_enabled(server)) { - cifs_server_lock(server); - smbd_destroy(server); - cifs_server_unlock(server); - } } static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets) @@ -2015,39 +2011,31 @@ static int match_session(struct cifs_ses *ses, /** * cifs_setup_ipc - helper to setup the IPC tcon for the session * @ses: smb session to issue the request on - * @ctx: the superblock configuration context to use for building the - * new tree connection for the IPC (interprocess communication RPC) + * @seal: if encryption is requested * * A new IPC connection is made and stored in the session * tcon_ipc. The IPC tcon has the same lifetime as the session. */ -static int -cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) +struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal) { int rc = 0, xid; struct cifs_tcon *tcon; char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; - bool seal = false; struct TCP_Server_Info *server = ses->server; /* * If the mount request that resulted in the creation of the * session requires encryption, force IPC to be encrypted too. */ - if (ctx->seal) { - if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) - seal = true; - else { - cifs_server_dbg(VFS, - "IPC: server doesn't support encryption\n"); - return -EOPNOTSUPP; - } + if (seal && !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) { + cifs_server_dbg(VFS, "IPC: server doesn't support encryption\n"); + return ERR_PTR(-EOPNOTSUPP); } /* no need to setup directory caching on IPC share, so pass in false */ tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_ipc); if (tcon == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock(&server->srv_lock); scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname); @@ -2057,13 +2045,13 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->ses = ses; tcon->ipc = true; tcon->seal = seal; - rc = server->ops->tree_connect(xid, ses, unc, tcon, ctx->local_nls); + rc = server->ops->tree_connect(xid, ses, unc, tcon, ses->local_nls); free_xid(xid); if (rc) { - cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); + cifs_server_dbg(VFS | ONCE, "failed to connect to IPC (rc=%d)\n", rc); tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc_fail); - goto out; + return ERR_PTR(rc); } cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid); @@ -2071,9 +2059,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) spin_lock(&tcon->tc_lock); tcon->status = TID_GOOD; spin_unlock(&tcon->tc_lock); - ses->tcon_ipc = tcon; -out: - return rc; + return tcon; } static struct cifs_ses * @@ -2347,6 +2333,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct cifs_tcon *ipc; struct cifs_ses *ses; unsigned int xid; int retries = 0; @@ -2525,7 +2512,12 @@ retry_new_session: list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - cifs_setup_ipc(ses, ctx); + ipc = cifs_setup_ipc(ses, ctx->seal); + spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); + ses->tcon_ipc = !IS_ERR(ipc) ? ipc : NULL; + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); free_xid(xid); diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 4dada26d56b5..f2ad0ccd08a7 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1120,24 +1120,63 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1) return match; } -static bool is_ses_good(struct cifs_ses *ses) +static bool is_ses_good(struct cifs_tcon *tcon, struct cifs_ses *ses) { struct TCP_Server_Info *server = ses->server; - struct cifs_tcon *tcon = ses->tcon_ipc; + struct cifs_tcon *ipc = NULL; bool ret; + spin_lock(&cifs_tcp_ses_lock); spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); + ret = !cifs_chan_needs_reconnect(ses, server) && - ses->ses_status == SES_GOOD && - !tcon->need_reconnect; + ses->ses_status == SES_GOOD; + spin_unlock(&ses->chan_lock); + + if (!ret) + goto out; + + if (likely(ses->tcon_ipc)) { + if (ses->tcon_ipc->need_reconnect) { + ret = false; + goto out; + } + } else { + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + + ipc = cifs_setup_ipc(ses, tcon->seal); + + spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); + if (!IS_ERR(ipc)) { + if (!ses->tcon_ipc) { + ses->tcon_ipc = ipc; + ipc = NULL; + } + } else { + ret = false; + ipc = NULL; + } + } + +out: spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + if (ipc && server->ops->tree_disconnect) { + unsigned int xid = get_xid(); + + (void)server->ops->tree_disconnect(xid, ipc); + _free_xid(xid); + } + tconInfoFree(ipc, netfs_trace_tcon_ref_free_ipc); return ret; } /* Refresh dfs referral of @ses */ -static void refresh_ses_referral(struct cifs_ses *ses) +static void refresh_ses_referral(struct cifs_tcon *tcon, struct cifs_ses *ses) { struct cache_entry *ce; unsigned int xid; @@ -1153,7 +1192,7 @@ static void refresh_ses_referral(struct cifs_ses *ses) } ses = CIFS_DFS_ROOT_SES(ses); - if (!is_ses_good(ses)) { + if (!is_ses_good(tcon, ses)) { cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__); goto out; @@ -1241,7 +1280,7 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) up_read(&htable_rw_lock); ses = CIFS_DFS_ROOT_SES(ses); - if (!is_ses_good(ses)) { + if (!is_ses_good(tcon, ses)) { cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__); goto out; @@ -1309,7 +1348,7 @@ void dfs_cache_refresh(struct work_struct *work) tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); list_for_each_entry(ses, &tcon->dfs_ses_list, dlist) - refresh_ses_referral(ses); + refresh_ses_referral(tcon, ses); refresh_tcon_referral(tcon, false); queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 239dd84a336f..cac355364e43 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2431,8 +2431,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, tcon = tlink_tcon(tlink); server = tcon->ses->server; - if (!server->ops->rename) - return -ENOSYS; + if (!server->ops->rename) { + rc = -ENOSYS; + goto do_rename_exit; + } /* try path-based rename first */ rc = server->ops->rename(xid, tcon, from_dentry, @@ -2482,11 +2484,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: - if (rc == 0) { + if (rc == 0) d_move(from_dentry, to_dentry); - /* Force a new lookup */ - d_drop(from_dentry); - } cifs_put_tlink(tlink); return rc; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index fe80e711cd75..70f3c0c67eeb 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -5,6 +5,7 @@ * Author(s): Steve French (sfrench@us.ibm.com) * */ +#include <crypto/md5.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/slab.h> @@ -37,23 +38,6 @@ #define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash static int -symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) -{ - int rc; - struct shash_desc *md5 = NULL; - - rc = cifs_alloc_hash("md5", &md5); - if (rc) - return rc; - - rc = crypto_shash_digest(md5, link_str, link_len, md5_hash); - if (rc) - cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); - cifs_free_hash(&md5); - return rc; -} - -static int parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, char **_link_str) { @@ -77,11 +61,7 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) return -EINVAL; - rc = symlink_hash(link_len, link_str, md5_hash); - if (rc) { - cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); - return rc; - } + md5(link_str, link_len, md5_hash); scnprintf(md5_str2, sizeof(md5_str2), CIFS_MF_SYMLINK_MD5_FORMAT, @@ -103,7 +83,6 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, static int format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) { - int rc; unsigned int link_len; unsigned int ofs; u8 md5_hash[16]; @@ -116,11 +95,7 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) return -ENAMETOOLONG; - rc = symlink_hash(link_len, link_str, md5_hash); - if (rc) { - cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); - return rc; - } + md5(link_str, link_len, md5_hash); scnprintf(buf, buf_len, CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index dda6dece802a..e10123d8cd7d 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -916,6 +916,14 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, char *data_end; struct dfs_referral_level_3 *ref; + if (rsp_size < sizeof(*rsp)) { + cifs_dbg(VFS | ONCE, + "%s: header is malformed (size is %u, must be %zu)\n", + __func__, rsp_size, sizeof(*rsp)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + *num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals); if (*num_of_nodes < 1) { @@ -925,6 +933,15 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, goto parse_DFS_referrals_exit; } + if (sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3) > rsp_size) { + cifs_dbg(VFS | ONCE, + "%s: malformed buffer (size is %u, must be at least %zu)\n", + __func__, rsp_size, + sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + ref = (struct dfs_referral_level_3 *) &(rsp->referrals); if (ref->VersionNumber != cpu_to_le16(3)) { cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0a8c2fcc9ded..ef3b498b0a02 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -584,7 +584,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, * to sign packets before we generate the channel signing key * (we sign with the session key) */ - rc = smb311_crypto_shash_allocate(chan->server); + rc = smb3_crypto_shash_allocate(chan->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); mutex_unlock(&ses->session_mutex); diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 89d933b4a8bc..96bfe4c63ccf 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -7,6 +7,7 @@ * Pavel Shilovsky (pshilovsky@samba.org) 2012 * */ +#include <crypto/sha2.h> #include <linux/ctype.h> #include "cifsglob.h" #include "cifsproto.h" @@ -888,13 +889,13 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve * @iov: array containing the SMB request we will send to the server * @nvec: number of array entries for the iov */ -int +void smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec) { - int i, rc; + int i; struct smb2_hdr *hdr; - struct shash_desc *sha512 = NULL; + struct sha512_ctx sha_ctx; hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ @@ -907,52 +908,22 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, * and we can test it. Preauth requires 3.1.1 for now. */ if (server->dialect != SMB311_PROT_ID) - return 0; + return; if (hdr->Command != SMB2_SESSION_SETUP) - return 0; + return; /* skip last sess setup response */ if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) && (hdr->Status == NT_STATUS_OK || (hdr->Status != cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)))) - return 0; + return; ok: - rc = smb311_crypto_shash_allocate(server); - if (rc) - return rc; - - sha512 = server->secmech.sha512; - rc = crypto_shash_init(sha512); - if (rc) { - cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); - return rc; - } - - rc = crypto_shash_update(sha512, ses->preauth_sha_hash, - SMB2_PREAUTH_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); - return rc; - } - - for (i = 0; i < nvec; i++) { - rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update sha512 shash\n", - __func__); - return rc; - } - } - - rc = crypto_shash_final(sha512, ses->preauth_sha_hash); - if (rc) { - cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", - __func__); - return rc; - } - - return 0; + sha512_init(&sha_ctx); + sha512_update(&sha_ctx, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); + for (i = 0; i < nvec; i++) + sha512_update(&sha_ctx, iov[i].iov_base, iov[i].iov_len); + sha512_final(&sha_ctx, ses->preauth_sha_hash); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 7c392cf5940b..1e39f2165e42 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2799,11 +2799,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; int rc; __le16 *utf16_path; - struct cached_fid *cfid = NULL; + struct cached_fid *cfid; int retries = 0, cur_sleep = 1; replay_again: /* reinitialize for possible replay */ + cfid = NULL; flags = CIFS_CP_CREATE_CLOSE_OP; oplock = SMB2_OPLOCK_LEVEL_NONE; server = cifs_pick_channel(ses); @@ -3212,8 +3213,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) { rc = -ENOMEM; - free_xid(xid); - return ERR_PTR(rc); + goto put_tlink; } oparms = (struct cifs_open_parms) { @@ -3245,6 +3245,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); } +put_tlink: cifs_put_tlink(tlink); free_xid(xid); @@ -3285,8 +3286,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen, utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) { rc = -ENOMEM; - free_xid(xid); - return rc; + goto put_tlink; } oparms = (struct cifs_open_parms) { @@ -3307,6 +3307,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); } +put_tlink: cifs_put_tlink(tlink); free_xid(xid); return rc; @@ -5446,7 +5447,6 @@ struct smb_version_operations smb20_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .calc_signature = smb2_calc_signature, .is_read_op = smb2_is_read_op, .set_oplock_level = smb2_set_oplock_level, .create_lease_buf = smb2_create_lease_buf, @@ -5550,7 +5550,6 @@ struct smb_version_operations smb21_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .calc_signature = smb2_calc_signature, .is_read_op = smb21_is_read_op, .set_oplock_level = smb21_set_oplock_level, .create_lease_buf = smb2_create_lease_buf, @@ -5660,7 +5659,6 @@ struct smb_version_operations smb30_operations = { .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb30signingkey, - .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, .set_oplock_level = smb3_set_oplock_level, @@ -5777,7 +5775,6 @@ struct smb_version_operations smb311_operations = { .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb311signingkey, - .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, .set_oplock_level = smb3_set_oplock_level, diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index b3f1398c9f79..5241daaae543 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -39,12 +39,6 @@ extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid); -extern int smb2_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, - bool allocate_crypto); -extern int smb3_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, - bool allocate_crypto); extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); extern bool smb2_is_valid_oplock_break(char *buffer, @@ -295,10 +289,10 @@ extern int smb2_validate_and_copy_iov(unsigned int offset, extern void smb2_copy_fs_info_to_kstatfs( struct smb2_fs_full_size_info *pfs_inf, struct kstatfs *kst); -extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); -extern int smb311_update_preauth_hash(struct cifs_ses *ses, - struct TCP_Server_Info *server, - struct kvec *iov, int nvec); +extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server); +extern void smb311_update_preauth_hash(struct cifs_ses *ses, + struct TCP_Server_Info *server, + struct kvec *iov, int nvec); extern int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, const char *path, u32 desired_access, diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 33f33013b392..6a9b80385b86 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -19,6 +19,7 @@ #include <linux/mempool.h> #include <linux/highmem.h> #include <crypto/aead.h> +#include <crypto/sha2.h> #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -26,53 +27,14 @@ #include "../common/smb2status.h" #include "smb2glob.h" -static int +int smb3_crypto_shash_allocate(struct TCP_Server_Info *server) { struct cifs_secmech *p = &server->secmech; - int rc; - rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); - if (rc) - goto err; - - rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); - if (rc) - goto err; - - return 0; -err: - cifs_free_hash(&p->hmacsha256); - return rc; + return cifs_alloc_hash("cmac(aes)", &p->aes_cmac); } -int -smb311_crypto_shash_allocate(struct TCP_Server_Info *server) -{ - struct cifs_secmech *p = &server->secmech; - int rc = 0; - - rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); - if (rc) - return rc; - - rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); - if (rc) - goto err; - - rc = cifs_alloc_hash("sha512", &p->sha512); - if (rc) - goto err; - - return 0; - -err: - cifs_free_hash(&p->aes_cmac); - cifs_free_hash(&p->hmacsha256); - return rc; -} - - static int smb3_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { @@ -247,16 +209,15 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) return tcon; } -int +static int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, - bool allocate_crypto) + bool allocate_crypto) { int rc; unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; - unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct shash_desc *shash = NULL; + struct hmac_sha256_ctx hmac_ctx; struct smb_rqst drqst; __u64 sid = le64_to_cpu(shdr->SessionId); u8 key[SMB2_NTLMV2_SESSKEY_SIZE]; @@ -271,30 +232,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); - if (allocate_crypto) { - rc = cifs_alloc_hash("hmac(sha256)", &shash); - if (rc) { - cifs_server_dbg(VFS, - "%s: sha256 alloc failed\n", __func__); - goto out; - } - } else { - shash = server->secmech.hmacsha256; - } - - rc = crypto_shash_setkey(shash->tfm, key, sizeof(key)); - if (rc) { - cifs_server_dbg(VFS, - "%s: Could not update with response\n", - __func__); - goto out; - } - - rc = crypto_shash_init(shash); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not init sha256", __func__); - goto out; - } + hmac_sha256_init_usingrawkey(&hmac_ctx, key, sizeof(key)); /* * For SMB2+, __cifs_calc_signature() expects to sign only the actual @@ -305,25 +243,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, */ drqst = *rqst; if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { - rc = crypto_shash_update(shash, iov[0].iov_base, - iov[0].iov_len); - if (rc) { - cifs_server_dbg(VFS, - "%s: Could not update with payload\n", - __func__); - goto out; - } + hmac_sha256_update(&hmac_ctx, iov[0].iov_base, iov[0].iov_len); drqst.rq_iov++; drqst.rq_nvec--; } - rc = __cifs_calc_signature(&drqst, server, sigptr, shash); + rc = __cifs_calc_signature( + &drqst, server, smb2_signature, + &(struct cifs_calc_sig_ctx){ .hmac = &hmac_ctx }); if (!rc) - memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); + memcpy(shdr->Signature, smb2_signature, SMB2_SIGNATURE_SIZE); -out: - if (allocate_crypto) - cifs_free_hash(&shash); return rc; } @@ -336,8 +266,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, __u8 L256[4] = {0, 0, 1, 0}; int rc = 0; unsigned char prfhash[SMB2_HMACSHA256_SIZE]; - unsigned char *hashptr = prfhash; struct TCP_Server_Info *server = ses->server; + struct hmac_sha256_ctx hmac_ctx; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); memset(key, 0x0, key_size); @@ -345,67 +275,26 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, rc = smb3_crypto_shash_allocate(server); if (rc) { cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, - ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_init(server->secmech.hmacsha256); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); - goto smb3signkey_ret; + return rc; } - rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); - goto smb3signkey_ret; - } + hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response, + SMB2_NTLMV2_SESSKEY_SIZE); + hmac_sha256_update(&hmac_ctx, i, 4); + hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len); + hmac_sha256_update(&hmac_ctx, &zero, 1); + hmac_sha256_update(&hmac_ctx, context.iov_base, context.iov_len); if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { - rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4); + hmac_sha256_update(&hmac_ctx, L256, 4); } else { - rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); - } - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); - goto smb3signkey_ret; + hmac_sha256_update(&hmac_ctx, L128, 4); } + hmac_sha256_final(&hmac_ctx, prfhash); - rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); - goto smb3signkey_ret; - } - - memcpy(key, hashptr, key_size); - -smb3signkey_ret: - return rc; + memcpy(key, prfhash, key_size); + return 0; } struct derivation { @@ -576,19 +465,21 @@ generate_smb311signingkey(struct cifs_ses *ses, return generate_smb3signingkey(ses, server, &triplet); } -int +static int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, - bool allocate_crypto) + bool allocate_crypto) { int rc; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; - unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct shash_desc *shash = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; + if (server->vals->protocol_id <= SMB21_PROT_ID) + return smb2_calc_signature(rqst, server, allocate_crypto); + rc = smb3_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); if (unlikely(rc)) { cifs_server_dbg(FYI, "%s: Could not get signing key\n", __func__); @@ -643,9 +534,11 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, drqst.rq_nvec--; } - rc = __cifs_calc_signature(&drqst, server, sigptr, shash); + rc = __cifs_calc_signature( + &drqst, server, smb3_signature, + &(struct cifs_calc_sig_ctx){ .shash = shash }); if (!rc) - memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); + memcpy(shdr->Signature, smb3_signature, SMB2_SIGNATURE_SIZE); out: if (allocate_crypto) @@ -657,7 +550,6 @@ out: static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int rc = 0; struct smb2_hdr *shdr; struct smb2_sess_setup_req *ssr; bool is_binding; @@ -684,9 +576,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) return 0; } - rc = server->ops->calc_signature(rqst, server, false); - - return rc; + return smb3_calc_signature(rqst, server, false); } int @@ -722,7 +612,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE); - rc = server->ops->calc_signature(rqst, server, true); + rc = smb3_calc_signature(rqst, server, true); if (rc) return rc; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 316f398c70f4..85a4c55b61b8 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -172,6 +172,7 @@ static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc) * in order to notice the broken connection. */ wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.lcredits.wait_queue); wake_up_all(&sc->send_io.credits.wait_queue); wake_up_all(&sc->send_io.pending.dec_wait_queue); wake_up_all(&sc->send_io.pending.zero_wait_queue); @@ -495,6 +496,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_send_io *request = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); struct smbdirect_socket *sc = request->socket; + int lcredits = 0; log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n", request, ib_wc_status_msg(wc->status)); @@ -504,22 +506,24 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) request->sge[i].addr, request->sge[i].length, DMA_TO_DEVICE); + mempool_free(request, sc->send_io.mem.pool); + lcredits += 1; if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { if (wc->status != IB_WC_WR_FLUSH_ERR) log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n", ib_wc_status_msg(wc->status), wc->opcode); - mempool_free(request, sc->send_io.mem.pool); smbd_disconnect_rdma_connection(sc); return; } + atomic_add(lcredits, &sc->send_io.lcredits.count); + wake_up(&sc->send_io.lcredits.wait_queue); + if (atomic_dec_and_test(&sc->send_io.pending.count)) wake_up(&sc->send_io.pending.zero_wait_queue); wake_up(&sc->send_io.pending.dec_wait_queue); - - mempool_free(request, sc->send_io.mem.pool); } static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp) @@ -567,6 +571,7 @@ static bool process_negotiation_response( log_rdma_event(ERR, "error: credits_granted==0\n"); return false; } + atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { @@ -1114,6 +1119,24 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc, struct smbdirect_data_transfer *packet; int new_credits = 0; +wait_lcredit: + /* Wait for local send credits */ + rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue, + atomic_read(&sc->send_io.lcredits.count) > 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (rc) + goto err_wait_lcredit; + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + log_outgoing(ERR, "disconnected not sending on wait_credit\n"); + rc = -EAGAIN; + goto err_wait_lcredit; + } + if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) { + atomic_inc(&sc->send_io.lcredits.count); + goto wait_lcredit; + } + wait_credit: /* Wait for send credits. A SMBD packet needs one credit */ rc = wait_event_interruptible(sc->send_io.credits.wait_queue, @@ -1132,23 +1155,6 @@ wait_credit: goto wait_credit; } -wait_send_queue: - wait_event(sc->send_io.pending.dec_wait_queue, - atomic_read(&sc->send_io.pending.count) < sp->send_credit_target || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - log_outgoing(ERR, "disconnected not sending on wait_send_queue\n"); - rc = -EAGAIN; - goto err_wait_send_queue; - } - - if (unlikely(atomic_inc_return(&sc->send_io.pending.count) > - sp->send_credit_target)) { - atomic_dec(&sc->send_io.pending.count); - goto wait_send_queue; - } - request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL); if (!request) { rc = -ENOMEM; @@ -1229,10 +1235,21 @@ wait_send_queue: le32_to_cpu(packet->data_length), le32_to_cpu(packet->remaining_data_length)); + /* + * Now that we got a local and a remote credit + * we add us as pending + */ + atomic_inc(&sc->send_io.pending.count); + rc = smbd_post_send(sc, request); if (!rc) return 0; + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); + + wake_up(&sc->send_io.pending.dec_wait_queue); + err_dma: for (i = 0; i < request->num_sge; i++) if (request->sge[i].addr) @@ -1246,14 +1263,14 @@ err_dma: atomic_sub(new_credits, &sc->recv_io.credits.count); err_alloc: - if (atomic_dec_and_test(&sc->send_io.pending.count)) - wake_up(&sc->send_io.pending.zero_wait_queue); - -err_wait_send_queue: - /* roll back send credits and pending */ atomic_inc(&sc->send_io.credits.count); + wake_up(&sc->send_io.credits.wait_queue); err_wait_credit: + atomic_inc(&sc->send_io.lcredits.count); + wake_up(&sc->send_io.lcredits.wait_queue); + +err_wait_lcredit: return rc; } @@ -1575,12 +1592,12 @@ void smbd_destroy(struct TCP_Server_Info *server) disable_work_sync(&sc->disconnect_work); log_rdma_event(INFO, "destroying rdma session\n"); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) smbd_disconnect_rdma_work(&sc->disconnect_work); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) { log_rdma_event(INFO, "wait for transport being disconnected\n"); - wait_event_interruptible( - sc->status_wait, - sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + log_rdma_event(INFO, "waited for transport being disconnected\n"); } /* @@ -1624,19 +1641,7 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "free receive buffers\n"); destroy_receive_buffers(sc); - /* - * For performance reasons, memory registration and deregistration - * are not locked by srv_mutex. It is possible some processes are - * blocked on transport srv_mutex while holding memory registration. - * Release the transport srv_mutex to allow them to hit the failure - * path when sending data, and then release memory registrations. - */ log_rdma_event(INFO, "freeing mr list\n"); - while (atomic_read(&sc->mr_io.used.count)) { - cifs_server_unlock(server); - msleep(1000); - cifs_server_lock(server); - } destroy_mr_list(sc); ib_free_cq(sc->ib.send_cq); @@ -1779,6 +1784,7 @@ static struct smbd_connection *_smbd_get_connection( struct smbdirect_socket *sc; struct smbdirect_socket_parameters *sp; struct rdma_conn_param conn_param; + struct ib_qp_cap qp_cap; struct ib_qp_init_attr qp_attr; struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; struct ib_port_immutable port_immutable; @@ -1850,6 +1856,25 @@ static struct smbd_connection *_smbd_get_connection( goto config_failed; } + sp->responder_resources = + min_t(u8, sp->responder_resources, + sc->ib.dev->attrs.max_qp_rd_atom); + log_rdma_mr(INFO, "responder_resources=%d\n", + sp->responder_resources); + + /* + * We use allocate sp->responder_resources * 2 MRs + * and each MR needs WRs for REG and INV, so + * we use '* 4'. + * + * +1 for ib_drain_qp() + */ + memset(&qp_cap, 0, sizeof(qp_cap)); + qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1; + qp_cap.max_recv_wr = sp->recv_credit_max + 1; + qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; + qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); if (IS_ERR(sc->ib.pd)) { rc = PTR_ERR(sc->ib.pd); @@ -1860,7 +1885,7 @@ static struct smbd_connection *_smbd_get_connection( sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, - sp->send_credit_target, IB_POLL_SOFTIRQ); + qp_cap.max_send_wr, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.send_cq)) { sc->ib.send_cq = NULL; goto alloc_cq_failed; @@ -1868,7 +1893,7 @@ static struct smbd_connection *_smbd_get_connection( sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, - sp->recv_credit_max, IB_POLL_SOFTIRQ); + qp_cap.max_recv_wr, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.recv_cq)) { sc->ib.recv_cq = NULL; goto alloc_cq_failed; @@ -1877,11 +1902,7 @@ static struct smbd_connection *_smbd_get_connection( memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smbd_qp_async_error_upcall; qp_attr.qp_context = sc; - qp_attr.cap.max_send_wr = sp->send_credit_target; - qp_attr.cap.max_recv_wr = sp->recv_credit_max; - qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; - qp_attr.cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; - qp_attr.cap.max_inline_data = 0; + qp_attr.cap = qp_cap; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = sc->ib.send_cq; @@ -1895,12 +1916,6 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.qp = sc->rdma.cm_id->qp; - sp->responder_resources = - min_t(u8, sp->responder_resources, - sc->ib.dev->attrs.max_qp_rd_atom); - log_rdma_mr(INFO, "responder_resources=%d\n", - sp->responder_resources); - memset(&conn_param, 0, sizeof(conn_param)); conn_param.initiator_depth = sp->initiator_depth; conn_param.responder_resources = sp->responder_resources; @@ -2352,18 +2367,84 @@ static void smbd_mr_recovery_work(struct work_struct *work) } } +static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr) +{ + struct smbdirect_socket *sc = mr->socket; + + lockdep_assert_held(&mr->mutex); + + if (mr->state == SMBDIRECT_MR_DISABLED) + return; + + if (mr->mr) + ib_dereg_mr(mr->mr); + if (mr->sgt.nents) + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + kfree(mr->sgt.sgl); + + mr->mr = NULL; + mr->sgt.sgl = NULL; + mr->sgt.nents = 0; + + mr->state = SMBDIRECT_MR_DISABLED; +} + +static void smbd_mr_free_locked(struct kref *kref) +{ + struct smbdirect_mr_io *mr = + container_of(kref, struct smbdirect_mr_io, kref); + + lockdep_assert_held(&mr->mutex); + + /* + * smbd_mr_disable_locked() should already be called! + */ + if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED)) + smbd_mr_disable_locked(mr); + + mutex_unlock(&mr->mutex); + mutex_destroy(&mr->mutex); + kfree(mr); +} + static void destroy_mr_list(struct smbdirect_socket *sc) { struct smbdirect_mr_io *mr, *tmp; + LIST_HEAD(all_list); + unsigned long flags; disable_work_sync(&sc->mr_io.recovery_work); - list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) { - if (mr->state == SMBDIRECT_MR_INVALIDATED) - ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, - mr->sgt.nents, mr->dir); - ib_dereg_mr(mr->mr); - kfree(mr->sgt.sgl); - kfree(mr); + + spin_lock_irqsave(&sc->mr_io.all.lock, flags); + list_splice_tail_init(&sc->mr_io.all.list, &all_list); + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); + + list_for_each_entry_safe(mr, tmp, &all_list, list) { + mutex_lock(&mr->mutex); + + smbd_mr_disable_locked(mr); + list_del(&mr->list); + mr->socket = NULL; + + /* + * No kref_put_mutex() as it's already locked. + * + * If smbd_mr_free_locked() is called + * and the mutex is unlocked and mr is gone, + * in that case kref_put() returned 1. + * + * If kref_put() returned 0 we know that + * smbd_mr_free_locked() didn't + * run. Not by us nor by anyone else, as we + * still hold the mutex, so we need to unlock. + * + * If the mr is still registered it will + * be dangling (detached from the connection + * waiting for smbd_deregister_mr() to be + * called in order to free the memory. + */ + if (!kref_put(&mr->kref, smbd_mr_free_locked)) + mutex_unlock(&mr->mutex); } } @@ -2377,10 +2458,9 @@ static void destroy_mr_list(struct smbdirect_socket *sc) static int allocate_mr_list(struct smbdirect_socket *sc) { struct smbdirect_socket_parameters *sp = &sc->parameters; - int i; - struct smbdirect_mr_io *smbdirect_mr, *tmp; - - INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + struct smbdirect_mr_io *mr; + int ret; + u32 i; if (sp->responder_resources == 0) { log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); @@ -2389,42 +2469,52 @@ static int allocate_mr_list(struct smbdirect_socket *sc) /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < sp->responder_resources * 2; i++) { - smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); - if (!smbdirect_mr) - goto cleanup_entries; - smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type, - sp->max_frmr_depth); - if (IS_ERR(smbdirect_mr->mr)) { + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = -ENOMEM; + goto kzalloc_mr_failed; + } + + kref_init(&mr->kref); + mutex_init(&mr->mutex); + + mr->mr = ib_alloc_mr(sc->ib.pd, + sc->mr_io.type, + sp->max_frmr_depth); + if (IS_ERR(mr->mr)) { + ret = PTR_ERR(mr->mr); log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", sc->mr_io.type, sp->max_frmr_depth); - goto out; + goto ib_alloc_mr_failed; } - smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!smbdirect_mr->sgt.sgl) { + + mr->sgt.sgl = kcalloc(sp->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!mr->sgt.sgl) { + ret = -ENOMEM; log_rdma_mr(ERR, "failed to allocate sgl\n"); - ib_dereg_mr(smbdirect_mr->mr); - goto out; + goto kcalloc_sgl_failed; } - smbdirect_mr->state = SMBDIRECT_MR_READY; - smbdirect_mr->socket = sc; + mr->state = SMBDIRECT_MR_READY; + mr->socket = sc; - list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list); + list_add_tail(&mr->list, &sc->mr_io.all.list); atomic_inc(&sc->mr_io.ready.count); } + + INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + return 0; -out: - kfree(smbdirect_mr); -cleanup_entries: - list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) { - list_del(&smbdirect_mr->list); - ib_dereg_mr(smbdirect_mr->mr); - kfree(smbdirect_mr->sgt.sgl); - kfree(smbdirect_mr); - } - return -ENOMEM; +kcalloc_sgl_failed: + ib_dereg_mr(mr->mr); +ib_alloc_mr_failed: + mutex_destroy(&mr->mutex); + kfree(mr); +kzalloc_mr_failed: + destroy_mr_list(sc); + return ret; } /* @@ -2458,6 +2548,7 @@ again: list_for_each_entry(ret, &sc->mr_io.all.list, list) { if (ret->state == SMBDIRECT_MR_READY) { ret->state = SMBDIRECT_MR_REGISTERED; + kref_get(&ret->kref); spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); atomic_dec(&sc->mr_io.ready.count); atomic_inc(&sc->mr_io.used.count); @@ -2504,9 +2595,8 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, { struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_mr_io *smbdirect_mr; + struct smbdirect_mr_io *mr; int rc, num_pages; - enum dma_data_direction dir; struct ib_reg_wr *reg_wr; num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); @@ -2517,49 +2607,47 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, return NULL; } - smbdirect_mr = get_mr(sc); - if (!smbdirect_mr) { + mr = get_mr(sc); + if (!mr) { log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; } - dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - smbdirect_mr->dir = dir; - smbdirect_mr->need_invalidate = need_invalidate; - smbdirect_mr->sgt.nents = 0; - smbdirect_mr->sgt.orig_nents = 0; + mutex_lock(&mr->mutex); + + mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + mr->need_invalidate = need_invalidate; + mr->sgt.nents = 0; + mr->sgt.orig_nents = 0; log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", num_pages, iov_iter_count(iter), sp->max_frmr_depth); - smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth); + smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth); - rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, dir); + rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); if (!rc) { log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", - num_pages, dir, rc); + num_pages, mr->dir, rc); goto dma_map_error; } - rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, NULL, PAGE_SIZE); - if (rc != smbdirect_mr->sgt.nents) { + rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE); + if (rc != mr->sgt.nents) { log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d nents = %x\n", - rc, smbdirect_mr->sgt.nents); + "ib_map_mr_sg failed rc = %d nents = %x\n", + rc, mr->sgt.nents); goto map_mr_error; } - ib_update_fast_reg_key(smbdirect_mr->mr, - ib_inc_rkey(smbdirect_mr->mr->rkey)); - reg_wr = &smbdirect_mr->wr; + ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey)); + reg_wr = &mr->wr; reg_wr->wr.opcode = IB_WR_REG_MR; - smbdirect_mr->cqe.done = register_mr_done; - reg_wr->wr.wr_cqe = &smbdirect_mr->cqe; + mr->cqe.done = register_mr_done; + reg_wr->wr.wr_cqe = &mr->cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = IB_SEND_SIGNALED; - reg_wr->mr = smbdirect_mr->mr; - reg_wr->key = smbdirect_mr->mr->rkey; + reg_wr->mr = mr->mr; + reg_wr->key = mr->mr->rkey; reg_wr->access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; @@ -2570,24 +2658,51 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, * on the next ib_post_send when we actually send I/O to remote peer */ rc = ib_post_send(sc->ib.qp, ®_wr->wr, NULL); - if (!rc) - return smbdirect_mr; + if (!rc) { + /* + * get_mr() gave us a reference + * via kref_get(&mr->kref), we keep that and let + * the caller use smbd_deregister_mr() + * to remove it again. + */ + mutex_unlock(&mr->mutex); + return mr; + } log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", rc, reg_wr->key); /* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/ map_mr_error: - ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, smbdirect_mr->dir); + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); dma_map_error: - smbdirect_mr->state = SMBDIRECT_MR_ERROR; + mr->sgt.nents = 0; + mr->state = SMBDIRECT_MR_ERROR; if (atomic_dec_and_test(&sc->mr_io.used.count)) wake_up(&sc->mr_io.cleanup.wait_queue); smbd_disconnect_rdma_connection(sc); + /* + * get_mr() gave us a reference + * via kref_get(&mr->kref), we need to remove it again + * on error. + * + * No kref_put_mutex() as it's already locked. + * + * If smbd_mr_free_locked() is called + * and the mutex is unlocked and mr is gone, + * in that case kref_put() returned 1. + * + * If kref_put() returned 0 we know that + * smbd_mr_free_locked() didn't + * run. Not by us nor by anyone else, as we + * still hold the mutex, so we need to unlock. + */ + if (!kref_put(&mr->kref, smbd_mr_free_locked)) + mutex_unlock(&mr->mutex); + return NULL; } @@ -2612,44 +2727,55 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) * and we have to locally invalidate the buffer to prevent data is being * modified by remote peer after upper layer consumes it */ -int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) +void smbd_deregister_mr(struct smbdirect_mr_io *mr) { - struct ib_send_wr *wr; - struct smbdirect_socket *sc = smbdirect_mr->socket; - int rc = 0; + struct smbdirect_socket *sc = mr->socket; + + mutex_lock(&mr->mutex); + if (mr->state == SMBDIRECT_MR_DISABLED) + goto put_kref; + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + smbd_mr_disable_locked(mr); + goto put_kref; + } + + if (mr->need_invalidate) { + struct ib_send_wr *wr = &mr->inv_wr; + int rc; - if (smbdirect_mr->need_invalidate) { /* Need to finish local invalidation before returning */ - wr = &smbdirect_mr->inv_wr; wr->opcode = IB_WR_LOCAL_INV; - smbdirect_mr->cqe.done = local_inv_done; - wr->wr_cqe = &smbdirect_mr->cqe; + mr->cqe.done = local_inv_done; + wr->wr_cqe = &mr->cqe; wr->num_sge = 0; - wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey; + wr->ex.invalidate_rkey = mr->mr->rkey; wr->send_flags = IB_SEND_SIGNALED; - init_completion(&smbdirect_mr->invalidate_done); + init_completion(&mr->invalidate_done); rc = ib_post_send(sc->ib.qp, wr, NULL); if (rc) { log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); + smbd_mr_disable_locked(mr); smbd_disconnect_rdma_connection(sc); goto done; } - wait_for_completion(&smbdirect_mr->invalidate_done); - smbdirect_mr->need_invalidate = false; + wait_for_completion(&mr->invalidate_done); + mr->need_invalidate = false; } else /* * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED * and defer to mr_recovery_work to recover the MR for next use */ - smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; + mr->state = SMBDIRECT_MR_INVALIDATED; - if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) { - ib_dma_unmap_sg( - sc->ib.dev, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, - smbdirect_mr->dir); - smbdirect_mr->state = SMBDIRECT_MR_READY; + if (mr->sgt.nents) { + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + mr->sgt.nents = 0; + } + + if (mr->state == SMBDIRECT_MR_INVALIDATED) { + mr->state = SMBDIRECT_MR_READY; if (atomic_inc_return(&sc->mr_io.ready.count) == 1) wake_up(&sc->mr_io.ready.wait_queue); } else @@ -2663,7 +2789,23 @@ done: if (atomic_dec_and_test(&sc->mr_io.used.count)) wake_up(&sc->mr_io.cleanup.wait_queue); - return rc; +put_kref: + /* + * No kref_put_mutex() as it's already locked. + * + * If smbd_mr_free_locked() is called + * and the mutex is unlocked and mr is gone, + * in that case kref_put() returned 1. + * + * If kref_put() returned 0 we know that + * smbd_mr_free_locked() didn't + * run. Not by us nor by anyone else, as we + * still hold the mutex, so we need to unlock + * and keep the mr in SMBDIRECT_MR_READY or + * SMBDIRECT_MR_ERROR state. + */ + if (!kref_put(&mr->kref, smbd_mr_free_locked)) + mutex_unlock(&mr->mutex); } static bool smb_set_sge(struct smb_extract_to_rdma *rdma, diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index d67ac5ddaff4..577d37dbeb8a 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -60,7 +60,7 @@ int smbd_send(struct TCP_Server_Info *server, struct smbdirect_mr_io *smbd_register_mr( struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate); -int smbd_deregister_mr(struct smbdirect_mr_io *mr); +void smbd_deregister_mr(struct smbdirect_mr_io *mr); #else #define cifs_rdma_enabled(server) 0 diff --git a/fs/smb/client/trace.c b/fs/smb/client/trace.c index 465483787193..16b0e719731f 100644 --- a/fs/smb/client/trace.c +++ b/fs/smb/client/trace.c @@ -4,5 +4,6 @@ * * Author(s): Steve French <stfrench@microsoft.com> */ +#include "cifsglob.h" #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index b88fa04f5792..029910d56c22 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -178,7 +178,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, memcpy(pacl, value, size); if (pTcon->ses->server->ops->set_acl) { int aclflags = 0; - rc = 0; switch (handler->flags) { case XATTR_CIFS_NTSD_FULL: diff --git a/fs/smb/common/cifsglob.h b/fs/smb/common/cifsglob.h new file mode 100644 index 000000000000..00fd215e3eb5 --- /dev/null +++ b/fs/smb/common/cifsglob.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ +/* + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + */ +#ifndef _COMMON_CIFS_GLOB_H +#define _COMMON_CIFS_GLOB_H + +static inline void inc_rfc1001_len(void *buf, int count) +{ + be32_add_cpu((__be32 *)buf, count); +} + +#define SMB1_VERSION_STRING "1.0" +#define SMB20_VERSION_STRING "2.0" +#define SMB21_VERSION_STRING "2.1" +#define SMBDEFAULT_VERSION_STRING "default" +#define SMB3ANY_VERSION_STRING "3" +#define SMB30_VERSION_STRING "3.0" +#define SMB302_VERSION_STRING "3.02" +#define ALT_SMB302_VERSION_STRING "3.0.2" +#define SMB311_VERSION_STRING "3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" + +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +#endif /* _COMMON_CIFS_GLOB_H */ diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index db22a1d0546b..ee5a90d691c8 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -142,7 +142,15 @@ struct smbdirect_socket { } mem; /* - * The credit state for the send side + * The local credit state for ib_post_send() + */ + struct { + atomic_t count; + wait_queue_head_t wait_queue; + } lcredits; + + /* + * The remote credit state for the send side */ struct { atomic_t count; @@ -337,6 +345,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work); disable_delayed_work_sync(&sc->idle.timer_work); + atomic_set(&sc->send_io.lcredits.count, 0); + init_waitqueue_head(&sc->send_io.lcredits.wait_queue); + atomic_set(&sc->send_io.credits.count, 0); init_waitqueue_head(&sc->send_io.credits.wait_queue); @@ -437,13 +448,22 @@ enum smbdirect_mr_state { SMBDIRECT_MR_READY, SMBDIRECT_MR_REGISTERED, SMBDIRECT_MR_INVALIDATED, - SMBDIRECT_MR_ERROR + SMBDIRECT_MR_ERROR, + SMBDIRECT_MR_DISABLED }; struct smbdirect_mr_io { struct smbdirect_socket *socket; struct ib_cqe cqe; + /* + * We can have up to two references: + * 1. by the connection + * 2. by the registration + */ + struct kref kref; + struct mutex mutex; + struct list_head list; enum smbdirect_mr_state state; diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 6fa025374f2f..1c181ef99929 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -147,14 +147,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; - int method; - down_read(&sess->rpc_lock); + lockdep_assert_held(&sess->rpc_lock); entry = xa_load(&sess->rpc_handle_list, id); - method = entry ? entry->method : 0; - up_read(&sess->rpc_lock); - return method; + return entry ? entry->method : 0; } void ksmbd_session_destroy(struct ksmbd_session *sess) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index ab1d45fcebde..f901ae18e68a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1806,6 +1806,7 @@ int smb2_sess_setup(struct ksmbd_work *work) if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; + ksmbd_user_session_put(sess); sess = NULL; goto out_err; } @@ -4625,8 +4626,15 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, * pipe without opening it, checking error condition here */ id = req->VolatileFileId; - if (!ksmbd_session_rpc_method(sess, id)) + + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); + if (!ksmbd_session_rpc_method(sess, id)) { + up_read(&sess->rpc_lock); return -ENOENT; + } + up_read(&sess->rpc_lock); ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n", req->FileInfoClass, req->VolatileFileId); @@ -6824,6 +6832,7 @@ int smb2_read(struct ksmbd_work *work) nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf); if (nbytes < 0) { + kvfree(aux_payload_buf); err = nbytes; goto out; } diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index d742ba754348..863716207a0d 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -10,6 +10,7 @@ #include "glob.h" #include "nterr.h" +#include "../common/cifsglob.h" #include "../common/smb2pdu.h" #include "smb2pdu.h" @@ -26,16 +27,8 @@ #define SMB311_PROT 6 #define BAD_PROT 0xFFFF -#define SMB1_VERSION_STRING "1.0" -#define SMB20_VERSION_STRING "2.0" -#define SMB21_VERSION_STRING "2.1" -#define SMB30_VERSION_STRING "3.0" -#define SMB302_VERSION_STRING "3.02" -#define SMB311_VERSION_STRING "3.1.1" - #define SMB_ECHO_INTERVAL (60 * HZ) -#define CIFS_DEFAULT_IOSIZE (64 * 1024) #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ #define MAX_STREAM_PROT_LEN 0x00FFFFFF @@ -464,9 +457,4 @@ static inline unsigned int get_rfc1002_len(void *buf) { return be32_to_cpu(*((__be32 *)buf)) & 0xffffff; } - -static inline void inc_rfc1001_len(void *buf, int count) -{ - be32_add_cpu((__be32 *)buf, count); -} #endif /* __SMB_COMMON_H__ */ diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 2aa1b29bea08..2c08cccfa680 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -263,10 +263,16 @@ static void ipc_msg_handle_free(int handle) static int handle_response(int type, void *payload, size_t sz) { - unsigned int handle = *(unsigned int *)payload; + unsigned int handle; struct ipc_msg_table_entry *entry; int ret = 0; + /* Prevent 4-byte read beyond declared payload size */ + if (sz < sizeof(unsigned int)) + return -EINVAL; + + handle = *(unsigned int *)payload; + ipc_update_last_active(); down_read(&ipc_msg_table_lock); hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) { @@ -825,6 +831,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle if (!msg) return NULL; + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; @@ -833,6 +842,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle req->flags |= KSMBD_RPC_WRITE_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); + up_read(&sess->rpc_lock); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); @@ -849,6 +859,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle) if (!msg) return NULL; + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; @@ -856,6 +869,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle) req->flags |= rpc_context_flags(sess); req->flags |= KSMBD_RPC_READ_METHOD; req->payload_sz = 0; + up_read(&sess->rpc_lock); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); @@ -876,6 +890,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle if (!msg) return NULL; + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; @@ -884,6 +901,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle req->flags |= KSMBD_RPC_IOCTL_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); + up_read(&sess->rpc_lock); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index b3077766d6ec..7d86553fcc7c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -219,6 +219,7 @@ static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) * in order to notice the broken connection. */ wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.lcredits.wait_queue); wake_up_all(&sc->send_io.credits.wait_queue); wake_up_all(&sc->send_io.pending.zero_wait_queue); wake_up_all(&sc->recv_io.reassembly.wait_queue); @@ -417,9 +418,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - INIT_WORK(&sc->recv_io.posted.refill_work, - smb_direct_post_recv_credits); - INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); conn = ksmbd_conn_alloc(); @@ -450,11 +448,10 @@ static void free_transport(struct smb_direct_transport *t) struct smbdirect_recv_io *recvmsg; disable_work_sync(&sc->disconnect_work); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) smb_direct_disconnect_rdma_work(&sc->disconnect_work); - wait_event_interruptible(sc->status_wait, - sc->status == SMBDIRECT_SOCKET_DISCONNECTED); - } + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) + wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); /* * Wake up all waiters in all wait queues @@ -469,9 +466,11 @@ static void free_transport(struct smb_direct_transport *t) disable_delayed_work_sync(&sc->idle.timer_work); disable_work_sync(&sc->idle.immediate_work); + if (sc->rdma.cm_id) + rdma_lock_handler(sc->rdma.cm_id); + if (sc->ib.qp) { ib_drain_qp(sc->ib.qp); - ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs); sc->ib.qp = NULL; rdma_destroy_qp(sc->rdma.cm_id); } @@ -498,8 +497,10 @@ static void free_transport(struct smb_direct_transport *t) ib_free_cq(sc->ib.recv_cq); if (sc->ib.pd) ib_dealloc_pd(sc->ib.pd); - if (sc->rdma.cm_id) + if (sc->rdma.cm_id) { + rdma_unlock_handler(sc->rdma.cm_id); rdma_destroy_id(sc->rdma.cm_id); + } smb_direct_destroy_pools(sc); ksmbd_conn_free(KSMBD_TRANS(t)->conn); @@ -524,6 +525,12 @@ static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, { int i; + /* + * The list needs to be empty! + * The caller should take care of it. + */ + WARN_ON_ONCE(!list_empty(&msg->sibling_list)); + if (msg->num_sge > 0) { ib_dma_unmap_single(sc->ib.dev, msg->sge[0].addr, msg->sge[0].length, @@ -909,9 +916,9 @@ static void smb_direct_post_recv_credits(struct work_struct *work) static void send_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbdirect_send_io *sendmsg, *sibling; + struct smbdirect_send_io *sendmsg, *sibling, *next; struct smbdirect_socket *sc; - struct list_head *pos, *prev, *end; + int lcredits = 0; sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); sc = sendmsg->socket; @@ -920,27 +927,31 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) ib_wc_status_msg(wc->status), wc->status, wc->opcode); + /* + * Free possible siblings and then the main send_io + */ + list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) { + list_del_init(&sibling->sibling_list); + smb_direct_free_sendmsg(sc, sibling); + lcredits += 1; + } + /* Note this frees wc->wr_cqe, but not wc */ + smb_direct_free_sendmsg(sc, sendmsg); + lcredits += 1; + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { pr_err("Send error. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); smb_direct_disconnect_rdma_connection(sc); + return; } + atomic_add(lcredits, &sc->send_io.lcredits.count); + wake_up(&sc->send_io.lcredits.wait_queue); + if (atomic_dec_and_test(&sc->send_io.pending.count)) wake_up(&sc->send_io.pending.zero_wait_queue); - - /* iterate and free the list of messages in reverse. the list's head - * is invalid. - */ - for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next; - prev != end; pos = prev, prev = prev->prev) { - sibling = container_of(pos, struct smbdirect_send_io, sibling_list); - smb_direct_free_sendmsg(sc, sibling); - } - - sibling = container_of(pos, struct smbdirect_send_io, sibling_list); - smb_direct_free_sendmsg(sc, sibling); } static int manage_credits_prior_sending(struct smbdirect_socket *sc) @@ -988,8 +999,6 @@ static int smb_direct_post_send(struct smbdirect_socket *sc, ret = ib_post_send(sc->ib.qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (atomic_dec_and_test(&sc->send_io.pending.count)) - wake_up(&sc->send_io.pending.zero_wait_queue); smb_direct_disconnect_rdma_connection(sc); } return ret; @@ -1032,19 +1041,29 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc, last->wr.send_flags = IB_SEND_SIGNALED; last->wr.wr_cqe = &last->cqe; + /* + * Remove last from send_ctx->msg_list + * and splice the rest of send_ctx->msg_list + * to last->sibling_list. + * + * send_ctx->msg_list is a valid empty list + * at the end. + */ + list_del_init(&last->sibling_list); + list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list); + send_ctx->wr_cnt = 0; + ret = smb_direct_post_send(sc, &first->wr); - if (!ret) { - smb_direct_send_ctx_init(send_ctx, - send_ctx->need_invalidate_rkey, - send_ctx->remote_key); - } else { - atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count); - wake_up(&sc->send_io.credits.wait_queue); - list_for_each_entry_safe(first, last, &send_ctx->msg_list, - sibling_list) { - smb_direct_free_sendmsg(sc, first); + if (ret) { + struct smbdirect_send_io *sibling, *next; + + list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { + list_del_init(&sibling->sibling_list); + smb_direct_free_sendmsg(sc, sibling); } + smb_direct_free_sendmsg(sc, last); } + return ret; } @@ -1070,6 +1089,23 @@ static int wait_for_credits(struct smbdirect_socket *sc, } while (true); } +static int wait_for_send_lcredit(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx) +{ + if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { + int ret; + + ret = smb_direct_flush_send_list(sc, send_ctx, false); + if (ret) + return ret; + } + + return wait_for_credits(sc, + &sc->send_io.lcredits.wait_queue, + &sc->send_io.lcredits.count, + 1); +} + static int wait_for_send_credits(struct smbdirect_socket *sc, struct smbdirect_send_batch *send_ctx) { @@ -1257,9 +1293,13 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc, int data_length; struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; + ret = wait_for_send_lcredit(sc, send_ctx); + if (ret) + goto lcredit_failed; + ret = wait_for_send_credits(sc, send_ctx); if (ret) - return ret; + goto credit_failed; data_length = 0; for (i = 0; i < niov; i++) @@ -1267,10 +1307,8 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc, ret = smb_direct_create_header(sc, data_length, remaining_data_length, &msg); - if (ret) { - atomic_inc(&sc->send_io.credits.count); - return ret; - } + if (ret) + goto header_failed; for (i = 0; i < niov; i++) { struct ib_sge *sge; @@ -1308,7 +1346,11 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc, return 0; err: smb_direct_free_sendmsg(sc, msg); +header_failed: atomic_inc(&sc->send_io.credits.count); +credit_failed: + atomic_inc(&sc->send_io.lcredits.count); +lcredit_failed: return ret; } @@ -1574,18 +1616,14 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, get_buf_page_count(desc_buf, desc_buf_len), msg->sg_list, SG_CHUNK_SIZE); if (ret) { - kfree(msg); ret = -ENOMEM; - goto out; + goto free_msg; } ret = get_sg_list(desc_buf, desc_buf_len, msg->sgt.sgl, msg->sgt.orig_nents); - if (ret < 0) { - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); - goto out; - } + if (ret < 0) + goto free_table; ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, @@ -1596,9 +1634,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); if (ret < 0) { pr_err("failed to init rdma_rw_ctx: %d\n", ret); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); - goto out; + goto free_table; } list_add_tail(&msg->list, &msg_list); @@ -1630,6 +1666,12 @@ out: atomic_add(credits_needed, &sc->rw_io.credits.count); wake_up(&sc->rw_io.credits.wait_queue); return ret; + +free_table: + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); +free_msg: + kfree(msg); + goto out; } static int smb_direct_rdma_write(struct ksmbd_transport *t, @@ -1687,10 +1729,10 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { - ib_drain_qp(sc->ib.qp); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; smb_direct_disconnect_rdma_work(&sc->disconnect_work); + if (sc->ib.qp) + ib_drain_qp(sc->ib.qp); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { @@ -1864,27 +1906,17 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) goto out_err; } - smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); return 0; out_err: put_recvmsg(sc, recvmsg); return ret; } -static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc) -{ - return min_t(unsigned int, - sc->ib.dev->attrs.max_fast_reg_page_list_len, - 256); -} - -static int smb_direct_init_params(struct smbdirect_socket *sc, - struct ib_qp_cap *cap) +static int smb_direct_init_params(struct smbdirect_socket *sc) { struct smbdirect_socket_parameters *sp = &sc->parameters; - struct ib_device *device = sc->ib.dev; - int max_send_sges, max_rw_wrs, max_send_wrs; - unsigned int max_sge_per_wr, wrs_per_credit; + int max_send_sges; + unsigned int maxpages; /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, * SMB2 response could be mapped. @@ -1895,67 +1927,20 @@ static int smb_direct_init_params(struct smbdirect_socket *sc, return -EINVAL; } - /* Calculate the number of work requests for RDMA R/W. - * The maximum number of pages which can be registered - * with one Memory region can be transferred with one - * R/W credit. And at least 4 work requests for each credit - * are needed for MR registration, RDMA R/W, local & remote - * MR invalidation. - */ - sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc); - sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, - (sc->rw_io.credits.num_pages - 1) * - PAGE_SIZE); - - max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, - device->attrs.max_sge_rd); - max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, - max_send_sges); - wrs_per_credit = max_t(unsigned int, 4, - DIV_ROUND_UP(sc->rw_io.credits.num_pages, - max_sge_per_wr) + 1); - max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit; - - max_send_wrs = sp->send_credit_target + max_rw_wrs; - if (max_send_wrs > device->attrs.max_cqe || - max_send_wrs > device->attrs.max_qp_wr) { - pr_err("consider lowering send_credit_target = %d\n", - sp->send_credit_target); - pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - device->attrs.max_cqe, device->attrs.max_qp_wr); - return -EINVAL; - } + atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); - if (sp->recv_credit_max > device->attrs.max_cqe || - sp->recv_credit_max > device->attrs.max_qp_wr) { - pr_err("consider lowering receive_credit_max = %d\n", - sp->recv_credit_max); - pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", - device->attrs.max_cqe, device->attrs.max_qp_wr); - return -EINVAL; - } - - if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) { - pr_err("warning: device max_send_sge = %d too small\n", - device->attrs.max_send_sge); - return -EINVAL; - } - if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { - pr_err("warning: device max_recv_sge = %d too small\n", - device->attrs.max_recv_sge); - return -EINVAL; - } + maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE); + sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev, + sc->rdma.cm_id->port_num, + maxpages); + sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max); + /* add one extra in order to handle unaligned pages */ + sc->rw_io.credits.max += 1; sc->recv_io.credits.target = 1; atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); - cap->max_send_wr = max_send_wrs; - cap->max_recv_wr = sp->recv_credit_max; - cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; - cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; - cap->max_inline_data = 0; - cap->max_rdma_ctxs = sc->rw_io.credits.max; return 0; } @@ -2029,13 +2014,129 @@ err: return -ENOMEM; } -static int smb_direct_create_qpair(struct smbdirect_socket *sc, - struct ib_qp_cap *cap) +static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr) +{ + /* + * This could be split out of rdma_rw_init_qp() + * and be a helper function next to rdma_rw_mr_factor() + * + * We can't check unlikely(rdma_rw_force_mr) here, + * but that is most likely 0 anyway. + */ + u32 factor; + + WARN_ON_ONCE(attr->port_num == 0); + + /* + * Each context needs at least one RDMA READ or WRITE WR. + * + * For some hardware we might need more, eventually we should ask the + * HCA driver for a multiplier here. + */ + factor = 1; + + /* + * If the device needs MRs to perform RDMA READ or WRITE operations, + * we'll need two additional MRs for the registrations and the + * invalidation. + */ + if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd) + factor += 2; /* inv + reg */ + + return factor * attr->cap.max_rdma_ctxs; +} + +static int smb_direct_create_qpair(struct smbdirect_socket *sc) { struct smbdirect_socket_parameters *sp = &sc->parameters; int ret; + struct ib_qp_cap qp_cap; struct ib_qp_init_attr qp_attr; - int pages_per_rw; + u32 max_send_wr; + u32 rdma_send_wr; + + /* + * Note that {rdma,ib}_create_qp() will call + * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0. + * It will adjust cap->max_send_wr to the required + * number of additional WRs for the RDMA RW operations. + * It will cap cap->max_send_wr to the device limit. + * + * +1 for ib_drain_qp + */ + qp_cap.max_send_wr = sp->send_credit_target + 1; + qp_cap.max_recv_wr = sp->recv_credit_max + 1; + qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; + qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; + qp_cap.max_inline_data = 0; + qp_cap.max_rdma_ctxs = sc->rw_io.credits.max; + + /* + * Find out the number of max_send_wr + * after rdma_rw_init_qp() adjusted it. + * + * We only do it on a temporary variable, + * as rdma_create_qp() will trigger + * rdma_rw_init_qp() again. + */ + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.cap = qp_cap; + qp_attr.port_num = sc->rdma.cm_id->port_num; + rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr); + max_send_wr = qp_cap.max_send_wr + rdma_send_wr; + + if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe || + qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) { + pr_err("Possible CQE overrun: max_send_wr %d\n", + qp_cap.max_send_wr); + pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); + pr_err("consider lowering send_credit_target = %d\n", + sp->send_credit_target); + return -EINVAL; + } + + if (qp_cap.max_rdma_ctxs && + (max_send_wr >= sc->ib.dev->attrs.max_cqe || + max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) { + pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n", + rdma_send_wr, qp_cap.max_send_wr, max_send_wr); + pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); + pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n", + sp->send_credit_target, qp_cap.max_rdma_ctxs); + return -EINVAL; + } + + if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe || + qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) { + pr_err("Possible CQE overrun: max_recv_wr %d\n", + qp_cap.max_recv_wr); + pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); + pr_err("consider lowering receive_credit_max = %d\n", + sp->recv_credit_max); + return -EINVAL; + } + + if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge || + qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) { + pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_send_sge, + sc->ib.dev->attrs.max_recv_sge); + return -EINVAL; + } sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); if (IS_ERR(sc->ib.pd)) { @@ -2046,8 +2147,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc, } sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, - sp->send_credit_target + - cap->max_rdma_ctxs, + max_send_wr, IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.send_cq)) { pr_err("Can't create RDMA send CQ\n"); @@ -2057,7 +2157,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc, } sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, - sp->recv_credit_max, + qp_cap.max_recv_wr, IB_POLL_WORKQUEUE); if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); @@ -2066,10 +2166,18 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc, goto err; } + /* + * We reset completely here! + * As the above use was just temporary + * to calc max_send_wr and rdma_send_wr. + * + * rdma_create_qp() will trigger rdma_rw_init_qp() + * again if max_rdma_ctxs is not 0. + */ memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smb_direct_qpair_handler; qp_attr.qp_context = sc; - qp_attr.cap = *cap; + qp_attr.cap = qp_cap; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = sc->ib.send_cq; @@ -2085,18 +2193,6 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc, sc->ib.qp = sc->rdma.cm_id->qp; sc->rdma.cm_id->event_handler = smb_direct_cm_handler; - pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1; - if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { - ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, - sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG, - sc->rw_io.credits.num_pages, 0); - if (ret) { - pr_err("failed to init mr pool count %zu pages %zu\n", - sc->rw_io.credits.max, sc->rw_io.credits.num_pages); - goto err; - } - } - return 0; err: if (sc->ib.qp) { @@ -2154,8 +2250,8 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return -ECONNABORTED; ret = smb_direct_check_recvmsg(recvmsg); - if (ret == -ECONNABORTED) - goto out; + if (ret) + goto put; req = (struct smbdirect_negotiate_req *)recvmsg->packet; sp->max_recv_size = min_t(int, sp->max_recv_size, @@ -2170,23 +2266,46 @@ static int smb_direct_prepare(struct ksmbd_transport *t) sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); - ret = smb_direct_send_negotiate_response(sc, ret); -out: +put: spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); put_recvmsg(sc, recvmsg); + if (ret == -ECONNABORTED) + return ret; + + if (ret) + goto respond; + + /* + * We negotiated with success, so we need to refill the recv queue. + * We do that with sc->idle.immediate_work still being disabled + * via smbdirect_socket_init(), so that queue_work(sc->workqueue, + * &sc->idle.immediate_work) in smb_direct_post_recv_credits() + * is a no-op. + * + * The message that grants the credits to the client is + * the negotiate response. + */ + INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); + smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); + if (unlikely(sc->first_error)) + return sc->first_error; + INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); + +respond: + ret = smb_direct_send_negotiate_response(sc, ret); + return ret; } static int smb_direct_connect(struct smbdirect_socket *sc) { - struct ib_qp_cap qp_cap; int ret; - ret = smb_direct_init_params(sc, &qp_cap); + ret = smb_direct_init_params(sc); if (ret) { pr_err("Can't configure RDMA parameters\n"); return ret; @@ -2198,7 +2317,7 @@ static int smb_direct_connect(struct smbdirect_socket *sc) return ret; } - ret = smb_direct_create_qpair(sc, &qp_cap); + ret = smb_direct_create_qpair(sc); if (ret) { pr_err("Can't accept RDMA client: %d\n", ret); return ret; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2d78e94072a0..e142bac4f9f8 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -498,17 +498,26 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); -static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, +static int sysfs_group_attrs_change_owner(struct kobject *kobj, + struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; - int error; + int error, i; + umode_t mode; if (grp->attrs) { struct attribute *const *attr; - for (attr = grp->attrs; *attr; attr++) { + for (i = 0, attr = grp->attrs; *attr; i++, attr++) { + if (grp->is_visible) { + mode = grp->is_visible(kobj, *attr, i); + if (mode & SYSFS_GROUP_INVISIBLE) + break; + if (!mode) + continue; + } kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; @@ -523,7 +532,14 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, if (grp->bin_attrs) { const struct bin_attribute *const *bin_attr; - for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { + for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { + if (grp->is_bin_visible) { + mode = grp->is_bin_visible(kobj, *bin_attr, i); + if (mode & SYSFS_GROUP_INVISIBLE) + break; + if (!mode) + continue; + } kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; @@ -573,7 +589,7 @@ int sysfs_group_change_owner(struct kobject *kobj, error = kernfs_setattr(grp_kn, &newattrs); if (!error) - error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); + error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp, &newattrs); kernfs_put(grp_kn); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 8930d5254e1d..b99da294e9a3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -119,6 +119,15 @@ config XFS_RT See the xfs man page in section 5 for additional information. + This option is mandatory to support zoned block devices. For these + devices, the realtime subvolume must be backed by a zoned block + device and a regular block device used as the main device (for + metadata). If the zoned block device is a host-managed SMR hard-disk + containing conventional zones at the beginning of its address space, + XFS will use the disk conventional zones as the main device and the + remaining sequential write required zones as the backing storage for + the realtime subvolume. + If unsure, say N. config XFS_DRAIN_INTENTS @@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select DEBUG_FS + depends on DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index d36a6ae0abe5..d4fcf591e63d 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -50,6 +50,12 @@ struct xfs_rtgroup { uint8_t *rtg_rsum_cache; struct xfs_open_zone *rtg_open_zone; }; + + /* + * Count of outstanding GC operations for zoned XFS. Any RTG with a + * non-zero rtg_gccount will not be picked as new GC victim. + */ + atomic_t rtg_gccount; }; /* diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 26721fab5cab..091c79e432e5 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -376,6 +376,36 @@ out_incomplete: return error; } +static uint +xchk_nlinks_ilock_dir( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* + * We're going to scan the directory entries, so we must be ready to + * pull the data fork mappings into memory if they aren't already. + */ + if (xfs_need_iread_extents(&ip->i_df)) + lock_mode = XFS_ILOCK_EXCL; + + /* + * We're going to scan the parent pointers, so we must be ready to + * pull the attr fork mappings into memory if they aren't already. + */ + if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) && + xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + + /* + * Take the IOLOCK so that other threads cannot start a directory + * update while we're scanning. + */ + lock_mode |= XFS_IOLOCK_SHARED; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + /* Walk a directory to bump the observed link counts of the children. */ STATIC int xchk_nlinks_collect_dir( @@ -394,8 +424,7 @@ xchk_nlinks_collect_dir( return 0; /* Prevent anyone from changing this directory while we walk it. */ - xfs_ilock(dp, XFS_IOLOCK_SHARED); - lock_mode = xfs_ilock_data_map_shared(dp); + lock_mode = xchk_nlinks_ilock_dir(dp); /* * The dotdot entry of an unlinked directory still points to the last @@ -452,7 +481,6 @@ out_abort: xchk_iscan_abort(&xnc->collect_iscan); out_unlock: xfs_iunlock(dp, lock_mode); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return error; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 773d959965dc..47edf3041631 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1751,7 +1751,7 @@ xfs_init_buftarg( const char *descr) { /* The maximum size of the buftarg is only known once the sb is read. */ - btp->bt_nr_sectors = (xfs_daddr_t)-1; + btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8fa7bdf59c91..e25cd2a160f3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache; */ struct xfs_buf; +#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) #define XBF_READ (1u << 0) /* buffer intended for reading from device */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f046d1215b04..b871dfde372b 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -236,7 +236,6 @@ typedef struct xfs_mount { bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; unsigned int m_zonegc_low_space; - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ /* max_atomic_write mount option value */ unsigned long long m_awu_max_bytes; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e85a156dc17d..1067ebb3b001 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = { * Table driven mount option parser. */ enum { - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, + Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, @@ -114,7 +114,21 @@ enum { Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, }; +#define fsparam_dead(NAME) \ + __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) + static const struct fs_parameter_spec xfs_fs_parameters[] = { + /* + * These mount options were supposed to be deprecated in September 2025 + * but the deprecation warning was buggy, so not all users were + * notified. The deprecation is now obnoxiously loud and postponed to + * September 2030. + */ + fsparam_dead("attr2"), + fsparam_dead("noattr2"), + fsparam_dead("ikeep"), + fsparam_dead("noikeep"), + fsparam_u32("logbufs", Opt_logbufs), fsparam_string("logbsize", Opt_logbsize), fsparam_string("logdev", Opt_logdev), @@ -786,6 +800,12 @@ xfs_fs_evict_inode( truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + + if (IS_ENABLED(CONFIG_XFS_RT) && + S_ISREG(inode->i_mode) && inode->i_private) { + xfs_open_zone_put(inode->i_private); + inode->i_private = NULL; + } } static void @@ -1373,16 +1393,25 @@ suffix_kstrtoull( static inline void xfs_fs_warn_deprecated( struct fs_context *fc, - struct fs_parameter *param, - uint64_t flag, - bool value) + struct fs_parameter *param) { - /* Don't print the warning if reconfiguring and current mount point - * already had the flag set + /* + * Always warn about someone passing in a deprecated mount option. + * Previously we wouldn't print the warning if we were reconfiguring + * and current mount point already had the flag set, but that was not + * the right thing to do. + * + * Many distributions mount the root filesystem with no options in the + * initramfs and rely on mount -a to remount the root fs with the + * options in fstab. However, the old behavior meant that there would + * never be a warning about deprecated mount options for the root fs in + * /etc/fstab. On a single-fs system, that means no warning at all. + * + * Compounding this problem are distribution scripts that copy + * /proc/mounts to fstab, which means that we can't remove mount + * options unless we're 100% sure they have only ever been advertised + * in /proc/mounts in response to explicitly provided mount options. */ - if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && - !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) - return; xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); } @@ -1408,6 +1437,9 @@ xfs_fs_parse_param( return opt; switch (opt) { + case Op_deprecated: + xfs_fs_warn_deprecated(fc, param); + return 0; case Opt_logbufs: parsing_mp->m_logbufs = result.uint_32; return 0; @@ -1528,7 +1560,6 @@ xfs_fs_parse_param( xfs_mount_set_dax_mode(parsing_mp, result.uint_32); return 0; #endif - /* Following mount options will be removed in September 2025 */ case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -2221,7 +2252,7 @@ xfs_init_fs_context( struct xfs_mount *mp; int i; - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) return -ENOMEM; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 1147bacb2da8..040402240807 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -26,14 +26,22 @@ #include "xfs_trace.h" #include "xfs_mru_cache.h" +static void +xfs_open_zone_free_rcu( + struct callback_head *cb) +{ + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu); + + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); +} + void xfs_open_zone_put( struct xfs_open_zone *oz) { - if (atomic_dec_and_test(&oz->oz_ref)) { - xfs_rtgroup_rele(oz->oz_rtg); - kfree(oz); - } + if (atomic_dec_and_test(&oz->oz_ref)) + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu); } static inline uint32_t @@ -238,6 +246,14 @@ xfs_zoned_map_extent( * If a data write raced with this GC write, keep the existing data in * the data fork, mark our newly written GC extent as reclaimable, then * move on to the next extent. + * + * Note that this can also happen when racing with operations that do + * not actually invalidate the data, but just move it to a different + * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the + * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the + * data was just moved around, GC fails to free the zone, but the zone + * becomes a GC candidate again as soon as all previous GC I/O has + * finished and these blocks will be moved out eventually. */ if (old_startblock != NULLFSBLOCK && old_startblock != data.br_startblock) @@ -614,14 +630,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) } /* - * Try to pack inodes that are written back after they were closed tight instead - * of trying to open new zones for them or spread them to the least recently - * used zone. This optimizes the data layout for workloads that untar or copy - * a lot of small files. Right now this does not separate multiple such + * Try to tightly pack small files that are written back after they were closed + * instead of trying to open new zones for them or spread them to the least + * recently used zone. This optimizes the data layout for workloads that untar + * or copy a lot of small files. Right now this does not separate multiple such * streams. */ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) { + struct xfs_mount *mp = ip->i_mount; + size_t zone_capacity = + XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks); + + /* + * Do not pack write files that are already using a full zone to avoid + * fragmentation. + */ + if (i_size_read(VFS_I(ip)) >= zone_capacity) + return false; + return !inode_is_open_for_write(VFS_I(ip)) && !(ip->i_diflags & XFS_DIFLAG_APPEND); } @@ -746,97 +773,54 @@ xfs_mark_rtg_boundary( } /* - * Cache the last zone written to for an inode so that it is considered first - * for subsequent writes. - */ -struct xfs_zone_cache_item { - struct xfs_mru_cache_elem mru; - struct xfs_open_zone *oz; -}; - -static inline struct xfs_zone_cache_item * -xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) -{ - return container_of(mru, struct xfs_zone_cache_item, mru); -} - -static void -xfs_zone_cache_free_func( - void *data, - struct xfs_mru_cache_elem *mru) -{ - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); - - xfs_open_zone_put(item->oz); - kfree(item); -} - -/* * Check if we have a cached last open zone available for the inode and * if yes return a reference to it. */ static struct xfs_open_zone * -xfs_cached_zone( - struct xfs_mount *mp, - struct xfs_inode *ip) +xfs_get_cached_zone( + struct xfs_inode *ip) { - struct xfs_mru_cache_elem *mru; - struct xfs_open_zone *oz; + struct xfs_open_zone *oz; - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); - if (!mru) - return NULL; - oz = xfs_zone_cache_item(mru)->oz; + rcu_read_lock(); + oz = VFS_I(ip)->i_private; if (oz) { /* * GC only steals open zones at mount time, so no GC zones * should end up in the cache. */ ASSERT(!oz->oz_is_gc); - ASSERT(atomic_read(&oz->oz_ref) > 0); - atomic_inc(&oz->oz_ref); + if (!atomic_inc_not_zero(&oz->oz_ref)) + oz = NULL; } - xfs_mru_cache_done(mp->m_zone_cache); + rcu_read_unlock(); + return oz; } /* - * Update the last used zone cache for a given inode. + * Stash our zone in the inode so that is is reused for future allocations. * - * The caller must have a reference on the open zone. + * The open_zone structure will be pinned until either the inode is freed or + * until the cached open zone is replaced with a different one because the + * current one was full when we tried to use it. This means we keep any + * open zone around forever as long as any inode that used it for the last + * write is cached, which slightly increases the memory use of cached inodes + * that were every written to, but significantly simplifies the cached zone + * lookup. Because the open_zone is clearly marked as full when all data + * in the underlying RTG was written, the caching is always safe. */ static void -xfs_zone_cache_create_association( - struct xfs_inode *ip, - struct xfs_open_zone *oz) +xfs_set_cached_zone( + struct xfs_inode *ip, + struct xfs_open_zone *oz) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_zone_cache_item *item = NULL; - struct xfs_mru_cache_elem *mru; + struct xfs_open_zone *old_oz; - ASSERT(atomic_read(&oz->oz_ref) > 0); atomic_inc(&oz->oz_ref); - - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); - if (mru) { - /* - * If we have an association already, update it to point to the - * new zone. - */ - item = xfs_zone_cache_item(mru); - xfs_open_zone_put(item->oz); - item->oz = oz; - xfs_mru_cache_done(mp->m_zone_cache); - return; - } - - item = kmalloc(sizeof(*item), GFP_KERNEL); - if (!item) { - xfs_open_zone_put(oz); - return; - } - item->oz = oz; - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); + old_oz = xchg(&VFS_I(ip)->i_private, oz); + if (old_oz) + xfs_open_zone_put(old_oz); } static void @@ -880,15 +864,14 @@ xfs_zone_alloc_and_submit( * the inode is still associated with a zone and use that if so. */ if (!*oz) - *oz = xfs_cached_zone(mp, ip); + *oz = xfs_get_cached_zone(ip); if (!*oz) { select_zone: *oz = xfs_select_zone(mp, write_hint, pack_tight); if (!*oz) goto out_error; - - xfs_zone_cache_create_association(ip, *oz); + xfs_set_cached_zone(ip, *oz); } alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), @@ -966,6 +949,12 @@ xfs_free_open_zones( xfs_open_zone_put(oz); } spin_unlock(&zi->zi_open_zones_lock); + + /* + * Wait for all open zones to be freed so that they drop the group + * references: + */ + rcu_barrier(); } struct xfs_init_zones { @@ -1279,14 +1268,6 @@ xfs_mount_zones( error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info; - - /* - * Set up a mru cache to track inode to open zone for data placement - * purposes. The magic values for group count and life time is the - * same as the defaults for file streams, which seems sane enough. - */ - xfs_mru_cache_create(&mp->m_zone_cache, mp, - 5000, 10, xfs_zone_cache_free_func); return 0; out_free_zone_info: @@ -1300,5 +1281,4 @@ xfs_unmount_zones( { xfs_zone_gc_unmount(mp); xfs_free_zone_info(mp->m_zone_info); - xfs_mru_cache_destroy(mp->m_zone_cache); } diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 064cd1a857a0..4ade54445532 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -114,6 +114,8 @@ struct xfs_gc_bio { /* Open Zone being written to */ struct xfs_open_zone *oz; + struct xfs_rtgroup *victim_rtg; + /* Bio used for reads and writes, including the bvec used by it */ struct bio_vec bv; struct bio bio; /* must be last */ @@ -264,6 +266,7 @@ xfs_zone_gc_iter_init( iter->rec_count = 0; iter->rec_idx = 0; iter->victim_rtg = victim_rtg; + atomic_inc(&victim_rtg->rtg_gccount); } /* @@ -362,6 +365,7 @@ xfs_zone_gc_query( return 0; done: + atomic_dec(&iter->victim_rtg->rtg_gccount); xfs_rtgroup_rele(iter->victim_rtg); iter->victim_rtg = NULL; return 0; @@ -451,6 +455,20 @@ xfs_zone_gc_pick_victim_from( if (!rtg) continue; + /* + * If the zone is already undergoing GC, don't pick it again. + * + * This prevents us from picking one of the zones for which we + * already submitted GC I/O, but for which the remapping hasn't + * concluded yet. This won't cause data corruption, but + * increases write amplification and slows down GC, so this is + * a bad thing. + */ + if (atomic_read(&rtg->rtg_gccount)) { + xfs_rtgroup_rele(rtg); + continue; + } + /* skip zones that are just waiting for a reset */ if (rtg_rmap(rtg)->i_used_blocks == 0 || rtg_rmap(rtg)->i_used_blocks >= victim_used) { @@ -491,21 +509,6 @@ xfs_zone_gc_select_victim( struct xfs_rtgroup *victim_rtg = NULL; unsigned int bucket; - if (xfs_is_shutdown(mp)) - return false; - - if (iter->victim_rtg) - return true; - - /* - * Don't start new work if we are asked to stop or park. - */ - if (kthread_should_stop() || kthread_should_park()) - return false; - - if (!xfs_zoned_need_gc(mp)) - return false; - spin_lock(&zi->zi_used_buckets_lock); for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); @@ -703,6 +706,9 @@ xfs_zone_gc_start_chunk( chunk->scratch = &data->scratch[data->scratch_idx]; chunk->data = data; chunk->oz = oz; + chunk->victim_rtg = iter->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); bio->bi_end_io = xfs_zone_gc_end_io; @@ -725,6 +731,8 @@ static void xfs_zone_gc_free_chunk( struct xfs_gc_bio *chunk) { + atomic_dec(&chunk->victim_rtg->rtg_gccount); + xfs_rtgroup_rele(chunk->victim_rtg); list_del(&chunk->entry); xfs_open_zone_put(chunk->oz); xfs_irele(chunk->ip); @@ -785,6 +793,10 @@ xfs_zone_gc_split_write( split_chunk->oz = chunk->oz; atomic_inc(&chunk->oz->oz_ref); + split_chunk->victim_rtg = chunk->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); + chunk->offset += split_len; chunk->len -= split_len; chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); @@ -975,6 +987,27 @@ xfs_zone_gc_reset_zones( } while (next); } +static bool +xfs_zone_gc_should_start_new_work( + struct xfs_zone_gc_data *data) +{ + if (xfs_is_shutdown(data->mp)) + return false; + if (!xfs_zone_gc_space_available(data)) + return false; + + if (!data->iter.victim_rtg) { + if (kthread_should_stop() || kthread_should_park()) + return false; + if (!xfs_zoned_need_gc(data->mp)) + return false; + if (!xfs_zone_gc_select_victim(data)) + return false; + } + + return true; +} + /* * Handle the work to read and write data for GC and to reset the zones, * including handling all completions. @@ -982,7 +1015,7 @@ xfs_zone_gc_reset_zones( * Note that the order of the chunks is preserved so that we don't undo the * optimal order established by xfs_zone_gc_query(). */ -static bool +static void xfs_zone_gc_handle_work( struct xfs_zone_gc_data *data) { @@ -996,30 +1029,22 @@ xfs_zone_gc_handle_work( zi->zi_reset_list = NULL; spin_unlock(&zi->zi_reset_list_lock); - if (!xfs_zone_gc_select_victim(data) || - !xfs_zone_gc_space_available(data)) { - if (list_empty(&data->reading) && - list_empty(&data->writing) && - list_empty(&data->resetting) && - !reset_list) - return false; - } - - __set_current_state(TASK_RUNNING); - try_to_freeze(); - - if (reset_list) + if (reset_list) { + set_current_state(TASK_RUNNING); xfs_zone_gc_reset_zones(data, reset_list); + } list_for_each_entry_safe(chunk, next, &data->resetting, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_finish_reset(chunk); } list_for_each_entry_safe(chunk, next, &data->writing, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_finish_chunk(chunk); } @@ -1027,15 +1052,18 @@ xfs_zone_gc_handle_work( list_for_each_entry_safe(chunk, next, &data->reading, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_write_chunk(chunk); } blk_finish_plug(&plug); - blk_start_plug(&plug); - while (xfs_zone_gc_start_chunk(data)) - ; - blk_finish_plug(&plug); - return true; + if (xfs_zone_gc_should_start_new_work(data)) { + set_current_state(TASK_RUNNING); + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + } } /* @@ -1059,8 +1087,18 @@ xfs_zoned_gcd( for (;;) { set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); xfs_set_zonegc_running(mp); - if (xfs_zone_gc_handle_work(data)) + + xfs_zone_gc_handle_work(data); + + /* + * Only sleep if nothing set the state to running. Else check for + * work again as someone might have queued up more work and woken + * us in the meantime. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); continue; + } if (list_empty(&data->reading) && list_empty(&data->writing) && diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 35e6de3d25ed..4322e26dd99a 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -44,6 +44,8 @@ struct xfs_open_zone { * the life time of an open zone. */ struct xfs_rtgroup *oz_rtg; + + struct rcu_head oz_rcu; }; /* |
