diff options
Diffstat (limited to 'fs')
38 files changed, 423 insertions, 225 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 3282adc84d52..a25c9910d90b 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,7 +31,7 @@ config BTRFS_FS continue to be mountable and usable by newer kernels. For more information, please see the web pages at - http://btrfs.wiki.kernel.org. + https://btrfs.readthedocs.io To compile this file system support as a module, choose M here. The module will be called btrfs. diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0cb1dee965a0..b2e5107b7cec 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3028,8 +3028,16 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); - /* We didn't update the block group item, need to revert @commit_used. */ - if (ret < 0) { + /* + * We didn't update the block group item, need to revert commit_used + * unless the block group item didn't exist yet - this is to prevent a + * race with a concurrent insertion of the block group item, with + * insert_block_group_item(), that happened just after we attempted to + * update. In that case we would reset commit_used to 0 just after the + * insertion set it to a value greater than 0 - if the block group later + * becomes with 0 used bytes, we would incorrectly skip its update. + */ + if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->commit_used = old_commit_used; spin_unlock(&cache->lock); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 53c1211dd60b..caf0bbd028d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -412,6 +412,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { + struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; @@ -419,18 +420,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; - delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + /* If it's in a rbtree, then we need to have delayed node locked. */ + lockdep_assert_held(&delayed_node->mutex); + + delayed_root = delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_item->delayed_node->ins_root; + root = &delayed_node->ins_root; else - root = &delayed_item->delayed_node->del_root; + root = &delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); - delayed_item->delayed_node->count--; + delayed_node->count--; finish_one_item(delayed_root); } @@ -1153,20 +1157,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { - btrfs_release_delayed_node(curr_node); - curr_node = NULL; btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); btrfs_release_delayed_node(prev_node); } + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). + */ + btrfs_free_path(path); + if (curr_node) btrfs_release_delayed_node(curr_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } -/* Will return 0 or -ENOMEM */ +static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; +} + +/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, @@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); + /* + * First attempt to insert the delayed item. This is to make the error + * handling path simpler in case we fail (-EEXIST). There's no risk of + * any other task coming in and running the delayed item before we do + * the metadata space reservation below, because we are holding the + * delayed node's mutex and that mutex must also be locked before the + * node's delayed items can be run. + */ + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); + btrfs_release_delayed_item(delayed_item); + btrfs_release_dir_index_item_space(trans); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + if (delayed_node->index_item_leaves == 0 || delayed_node->curr_index_batch_size + data_len > leaf_data_size) { delayed_node->curr_index_batch_size = data_len; @@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, * impossible. */ if (WARN_ON(ret)) { - mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_item(delayed_item); + mutex_unlock(&delayed_node->mutex); goto release_node; } delayed_node->index_item_leaves++; - } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - - /* - * Adding the new dir index item does not require touching another - * leaf, so we can release 1 unit of metadata that was previously - * reserved when starting the transaction. This applies only to - * the case where we had a transaction start and excludes the - * transaction join case (when replaying log trees). - */ - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, bytes, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); - ASSERT(trans->bytes_reserved >= bytes); - trans->bytes_reserved -= bytes; - } - - ret = __btrfs_add_delayed_item(delayed_node, delayed_item); - if (unlikely(ret)) { - btrfs_err(trans->fs_info, - "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->root_key.objectid, - delayed_node->inode_id, ret); - BUG(); + } else { + btrfs_release_dir_index_item_space(trans); } mutex_unlock(&delayed_node->mutex); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0a96ea8c1d3a..68f60d50e1fd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -520,6 +520,7 @@ static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; @@ -533,18 +534,19 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } + + ASSERT(spi); subpage = folio_get_private(folio); - ASSERT(subpage->dirty_bitmap); - while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + for (cur_bit = spi->dirty_offset; + cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; + cur_bit++) { unsigned long flags; u64 cur; - u16 tmp = (1 << cur_bit); spin_lock_irqsave(&subpage->lock, flags); - if (!(tmp & subpage->dirty_bitmap)) { + if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - cur_bit++; continue; } spin_unlock_irqrestore(&subpage->lock, flags); @@ -557,7 +559,7 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); - cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } @@ -1547,7 +1549,7 @@ static int transaction_kthread(void *arg) delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && - cur->state < TRANS_STATE_COMMIT_START && + cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -2682,8 +2684,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); - btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, BTRFS_LOCKDEP_TRANS_UNBLOCKED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, @@ -4870,7 +4872,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); - if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state >= TRANS_STATE_COMMIT_PREP) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a18ee7b5a166..75ab766fe156 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1958,6 +1958,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } + /* + * We don't need the path anymore, so release it and + * avoid deadlocks and lockdep warnings in case + * btrfs_iget() needs to lookup the inode from its root + * btree and lock the same leaf. + */ + btrfs_release_path(path); temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); @@ -1978,7 +1985,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index edb9b4a0dba1..7d6ee1e609bf 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -79,7 +79,7 @@ enum btrfs_lock_nesting { }; enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP, BTRFS_LOCKDEP_TRANS_UNBLOCKED, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, BTRFS_LOCKDEP_TRANS_COMPLETED, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b46ab348e8e5..345c449d588c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -639,7 +639,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); - ASSERT(trans); + ASSERT(trans || BTRFS_FS_ERROR(fs_info)); if (trans) { if (atomic_dec_and_test(&trans->pending_ordered)) wake_up(&trans->pending_wait); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 874e4394df86..0bf42dccb041 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -56,12 +56,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V - * Transaction N [[TRANS_STATE_COMMIT_START]] + * Transaction N [[TRANS_STATE_COMMIT_PREP]] + * | + * | If there are simultaneous calls to btrfs_commit_transaction() one will win + * | the race and the rest will wait for the winner to commit the transaction. + * | + * | The winner will wait for previous running transaction to completely finish + * | if there is one. * | - * | Will wait for previous running transaction to completely finish if there - * | is one + * Transaction N [[TRANS_STATE_COMMIT_START]] * | - * | Then one of the following happes: + * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. @@ -112,6 +117,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | @@ -1982,7 +1988,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ - btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -2129,7 +2135,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2153,7 +2159,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); - btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); @@ -2213,7 +2219,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } spin_lock(&fs_info->trans_lock); - if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); @@ -2225,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2237,9 +2243,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - cur_trans->state = TRANS_STATE_COMMIT_START; + cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2260,11 +2266,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; - } else { - spin_unlock(&fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); } } else { - spin_unlock(&fs_info->trans_lock); /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we @@ -2272,11 +2276,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + spin_unlock(&fs_info->trans_lock); + /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit @@ -2586,7 +2595,7 @@ lockdep_release: goto cleanup_transaction; lockdep_trans_commit_start_release: - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8e9fa23bd7fe..6b309f8a99a8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -14,6 +14,7 @@ enum btrfs_trans_state { TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index e028fafa04f3..996271473609 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 storage_space, remaining_space, max_variable_size; efi_status_t status; - status = efivar_query_variable_info(attr, &storage_space, &remaining_space, - &max_variable_size); - if (status != EFI_SUCCESS) - return efi_status_to_err(status); + /* Some UEFI firmware does not implement QueryVariableInfo() */ + storage_space = remaining_space = 0; + if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) { + status = efivar_query_variable_info(attr, &storage_space, + &remaining_space, + &max_variable_size); + if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED) + pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n", + status); + } /* * This is not a normal filesystem, so no point in pretending it has a block diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c91db9f57524..1e599305d85f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> +#include <linux/freezer.h> #include <trace/events/ext4.h> /* @@ -6906,6 +6907,21 @@ __acquires(bitlock) return ret; } +static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, + ext4_group_t grp) +{ + if (grp < ext4_get_groups_count(sb)) + return EXT4_CLUSTERS_PER_GROUP(sb) - 1; + return (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp) - 1) >> + EXT4_CLUSTER_BITS(sb); +} + +static bool ext4_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) @@ -6913,9 +6929,12 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count; + bool set_trimmed = false; void *bitmap; bitmap = e4b->bd_bitmap; + if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + set_trimmed = true; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6930,16 +6949,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) - break; + return count; count += next - start; } free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } + if (ext4_trim_interrupted()) + return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -6951,6 +6968,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) break; } + if (set_trimmed) + EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); + return count; } @@ -6961,7 +6981,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count - * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy @@ -6971,7 +6990,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks, bool set_trimmed) + ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; @@ -6988,13 +7007,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < EXT4_SB(sb)->s_last_trim_minblks) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0 && set_trimmed) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } else { + else ret = 0; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -7027,7 +7043,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -7046,10 +7061,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks - 1) { + if (end >= max_blks - 1) end = max_blks - 1; - eof = true; - } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -7063,9 +7076,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - whole_group = true; for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; grp = ext4_get_group_info(sb, group); if (!grp) continue; @@ -7082,13 +7096,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) { + if (group == last_group) end = last_cluster; - whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; - } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen, whole_group); + end, minlen); if (cnt < 0) { ret = cnt; break; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 41a6411c600b..bbda587f76b8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + - (EXT4_BLOCK_SIZE(inode->i_sb) - - sizeof(struct ext4_dir_entry_tail))); - while (d < top && d->rec_len) + (blocksize - sizeof(struct ext4_dir_entry_tail))); + while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + - le16_to_cpu(d->rec_len)); + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; @@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, #endif if (t->det_reserved_zero1 || - le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != + sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; @@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); - if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + if (rlen == blocksize) count_offset = 8; - else if (le16_to_cpu(dirent->rec_len) == 12) { + else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); - if (le16_to_cpu(dp->rec_len) != - EXT4_BLOCK_SIZE(inode->i_sb) - 12) + if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || @@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; + int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); @@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; - map_tail->size = le16_to_cpu(de->rec_len); + map_tail->size = ext4_rec_len_from_disk(de->rec_len, + blocksize); count++; cond_resched(); } - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de, blocksize); } return count; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9cbf8d98489a..4a280be229a6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2010,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { if (!spin_trylock(&gl->gl_lockref.lock)) continue; - if (!gl->gl_lockref.count) { + if (gl->gl_lockref.count <= 1 && + (gl->gl_state == LM_ST_UNLOCKED || + demote_ok(gl))) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d26759a98b10..f41ca89d216b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -567,15 +567,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote) struct super_block *sb = sdp->sd_vfs; if (!remote || - gl->gl_state != LM_ST_SHARED || + (gl->gl_state != LM_ST_SHARED && + gl->gl_state != LM_ST_UNLOCKED) || gl->gl_demote_state != LM_ST_UNLOCKED) return; /* * Try to get an active super block reference to prevent racing with - * unmount (see trylock_super()). But note that unmount isn't the only - * place where a write lock on s_umount is taken, and we can fail here - * because of things like remount as well. + * unmount (see super_trylock_shared()). But note that unmount isn't + * the only place where a write lock on s_umount is taken, and we can + * fail here because of things like remount as well. */ if (down_read_trylock(&sb->s_umount)) { atomic_inc(&sb->s_active); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 21ada332d555..1429945215a0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) return 0; ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1073259902a6..8d6f934c3d95 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -298,14 +298,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal, static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { - struct page *page = bh->b_page; char *addr; __u32 checksum; - addr = kmap_atomic(page); - checksum = crc32_be(crc32_sum, - (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); - kunmap_atomic(addr); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); + checksum = crc32_be(crc32_sum, addr, bh->b_size); + kunmap_local(addr); return checksum; } @@ -322,7 +320,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; - struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; @@ -331,11 +328,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, return; seq = cpu_to_be32(sequence); - addr = kmap_atomic(page); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), - bh->b_size); - kunmap_atomic(addr); + csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + kunmap_local(addr); if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 768fa05bcbed..30dec2bd2ecc 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1601,6 +1601,8 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: percpu_counter_destroy(&journal->j_checkpoint_jh_count); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4d1fda1f7143..5f08b5fd105a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh) /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ static void jbd2_freeze_jh_data(struct journal_head *jh) { - struct page *page; - int offset; char *source; struct buffer_head *bh = jh2bh(jh); J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); - page = bh->b_page; - offset = offset_in_page(bh->b_data); - source = kmap_atomic(page); + source = kmap_local_folio(bh->b_folio, bh_offset(bh)); /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); - memcpy(jh->b_frozen_data, source + offset, bh->b_size); - kunmap_atomic(source); + jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); + memcpy(jh->b_frozen_data, source, bh->b_size); + kunmap_local(source); /* * Now that the frozen data is saved off, we need to store any matching diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 3404707ddbe7..2cd3ccf4c439 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) xas_for_each(&xas, folio, last_page) { loff_t pg_end; bool pg_failed = false; + bool folio_started; if (xas_retry(&xas, folio)) continue; pg_end = folio_pos(folio) + folio_size(folio) - 1; + folio_started = false; for (;;) { loff_t sreq_end; @@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { folio_start_fscache(folio); + folio_started = true; + } pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 47d892a1d363..f6c74f424691 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, dreq->max_count = dreq_len; if (dreq->count > dreq_len) dreq->count = dreq_len; - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - else /* Clear outstanding error if this is EOF */ - dreq->error = 0; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } static void @@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) +{ + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); + + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -488,7 +498,9 @@ static void nfs_direct_add_page_head(struct list_head *list, kref_get(&head->wb_kref); } -static void nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *req, *subreq; @@ -510,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode) nfs_release_request(subreq); } } while ((subreq = subreq->wb_this_page) != req); - nfs_join_page_group(req, inode); + nfs_join_page_group(req, cinfo, inode); } } @@ -528,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); - dreq->count = 0; - dreq->max_count = 0; - list_for_each_entry(req, &reqs, wb_list) - dreq->max_count += req->wb_bytes; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -549,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -589,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; - dreq->max_count = 0; - dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; } else { status = dreq->error; @@ -601,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (status >= 0 && !nfs_write_match_verf(verf, req)) { + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, @@ -609,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) */ req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else /* Error or match */ + } else nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -662,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) while (!list_empty(&reqs)) { req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -711,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; @@ -755,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.offset + hdr->args.count - - hdr->io_start; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -794,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -837,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { - result = desc.pg_error; - nfs_unlock_and_release_request(req); - break; - } pgbase = 0; bytes -= req_len; requested_bytes += req_len; pos += req_len; dreq->bytes_left -= req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + + nfs_lock_request(req); + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7deb3cd76abe..a1dc33864906 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 27fb25567ce7..11e3a285594c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -417,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) .net = old->cl_net, .servername = old->cl_hostname, }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; if (clp->cl_proto != old->cl_proto) return; @@ -430,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) xprt_args.addrlen = clp_salen; rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, &max_connect); } /** @@ -1010,6 +1012,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 794343790ea8..3508d8238826 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f4cca8f00c0c..8c1ee1a1a28f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -955,18 +958,16 @@ static void nfs_folio_clear_commit(struct folio *folio) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ca748309c26..4199ede0583c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1058,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rename->rn_tname, rename->rn_tnamelen); if (status) return status; - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); return nfs_ok; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1582af33e204..c7af1095f6b5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1082,11 +1082,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { + struct seq_file *seq = file->private_data; + struct svc_serv *serv = seq->private; int ret = seq_release(inode, file); - struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - nfsd_put(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index bae404a1bad4..d1761ec5866a 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -618,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already * copied i_flags diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 3b4cc633d763..4193633c4c7a 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -19,7 +19,6 @@ struct ovl_aio_req { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; - struct fd fd; }; static struct kmem_cache *ovl_aio_request_cachep; @@ -280,7 +279,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl) static inline void ovl_aio_put(struct ovl_aio_req *aio_req) { if (refcount_dec_and_test(&aio_req->ref)) { - fdput(aio_req->fd); + fput(aio_req->iocb.ki_filp); kmem_cache_free(ovl_aio_request_cachep, aio_req); } } @@ -342,10 +341,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); @@ -409,10 +407,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index de2dfbaae821..d7c302442c1e 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2680,7 +2680,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, } cifsFileInfo_put(cfile); - return -ENOTSUPP; + return -EOPNOTSUPP; } int cifs_truncate_page(struct address_space *mapping, loff_t from) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index d9eda2e958b4..9aeecee6b91b 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -297,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); - return -ENOTSUPP; + return -EOPNOTSUPP; } spin_lock(&server->req_lock); @@ -1161,7 +1161,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, /* Use a fudge factor of 256 bytes in case we collide * with a different set_EAs command. */ - if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 < used_len + ea_name_len + ea_value_len + 1) { rc = -ENOSPC; @@ -4591,7 +4591,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (shdr->Command != SMB2_READ) { cifs_server_dbg(VFS, "only big read responses are supported\n"); - return -ENOTSUPP; + return -EOPNOTSUPP; } if (server->ops->is_session_expired && diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 092b0087c9dc..44d4943e9c56 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -89,20 +89,26 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, struct TCP_Server_Info *server) { struct smb3_hdr_req *smb3_hdr; + shdr->ProtocolId = SMB2_PROTO_NUMBER; shdr->StructureSize = cpu_to_le16(64); shdr->Command = smb2_cmd; - if (server->dialect >= SMB30_PROT_ID) { - /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */ - smb3_hdr = (struct smb3_hdr_req *)shdr; - /* if primary channel is not set yet, use default channel for chan sequence num */ - if (SERVER_IS_CHAN(server)) - smb3_hdr->ChannelSequence = - cpu_to_le16(server->primary_server->channel_sequence_num); - else - smb3_hdr->ChannelSequence = cpu_to_le16(server->channel_sequence_num); - } + if (server) { + /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */ + if (server->dialect >= SMB30_PROT_ID) { + smb3_hdr = (struct smb3_hdr_req *)shdr; + /* + * if primary channel is not set yet, use default + * channel for chan sequence num + */ + if (SERVER_IS_CHAN(server)) + smb3_hdr->ChannelSequence = + cpu_to_le16(server->primary_server->channel_sequence_num); + else + smb3_hdr->ChannelSequence = + cpu_to_le16(server->channel_sequence_num); + } spin_lock(&server->req_lock); /* Request up to 10 credits but don't go over the limit. */ if (server->credits >= server->max_credits) @@ -2234,7 +2240,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) * (most servers default to 120 seconds) and most clients default to 0. * This can be overridden at mount ("handletimeout=") if the user wants * a different persistent (or resilient) handle timeout for all opens - * opens on a particular SMB3 mount. + * on a particular SMB3 mount. */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); @@ -2379,7 +2385,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) return 0; } -/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ +/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ static void setup_owner_group_sids(char *buf) { struct owner_group_sids *sids = (struct owner_group_sids *)buf; @@ -3124,6 +3130,7 @@ void SMB2_ioctl_free(struct smb_rqst *rqst) { int i; + if (rqst && rqst->rq_iov) { cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ for (i = 1; i < rqst->rq_nvec; i++) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 1b5d9794ed5b..d52057a511ee 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -18,7 +18,7 @@ #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uaccess.h> -#include <asm/processor.h> +#include <linux/processor.h> #include <linux/mempool.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 749660110878..544022dd6d20 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6312,7 +6312,7 @@ int smb2_read(struct ksmbd_work *work) aux_payload_buf, nbytes); kvfree(aux_payload_buf); - + aux_payload_buf = NULL; nbytes = 0; if (remain_bytes < 0) { err = (int)remain_bytes; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index e5e438bf5499..6c0305be895e 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1420,7 +1420,6 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, out: posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); - mark_inode_dirty(inode); return rc; } diff --git a/fs/stat.c b/fs/stat.c index 6822ac77aec2..6e60389d6a15 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -419,12 +419,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #ifdef __ARCH_WANT_NEW_STAT -#if BITS_PER_LONG == 32 -# define choose_32_64(a,b) a -#else -# define choose_32_64(a,b) b -#endif - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 237c6f370ad9..9f64e7332796 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -185,17 +185,49 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * /** * eventfs_set_ef_status_free - set the ef->status to free + * @ti: the tracefs_inode of the dentry * @dentry: dentry who's status to be freed * * eventfs_set_ef_status_free will be called if no more * references remain */ -void eventfs_set_ef_status_free(struct dentry *dentry) +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) { struct tracefs_inode *ti_parent; - struct eventfs_file *ef; + struct eventfs_inode *ei; + struct eventfs_file *ef, *tmp; + + /* The top level events directory may be freed by this */ + if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { + LIST_HEAD(ef_del_list); + + mutex_lock(&eventfs_mutex); + + ei = ti->private; + + /* Record all the top level files */ + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + lockdep_is_held(&eventfs_mutex)) { + list_add_tail(&ef->del_list, &ef_del_list); + } + + /* Nothing should access this, but just in case! */ + ti->private = NULL; + + mutex_unlock(&eventfs_mutex); + + /* Now safely free the top level files and their children */ + list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { + list_del(&ef->del_list); + eventfs_remove(ef); + } + + kfree(ei); + return; + } mutex_lock(&eventfs_mutex); + ti_parent = get_tracefs(dentry->d_parent->d_inode); if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) goto out; @@ -420,7 +452,8 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) ei = ti->private; idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_rcu(ef, &ei->e_top_files, list) { + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + srcu_read_lock_held(&eventfs_srcu)) { create_dentry(ef, dentry, false); } srcu_read_unlock(&eventfs_srcu, idx); @@ -491,6 +524,9 @@ struct dentry *eventfs_create_events_dir(const char *name, struct tracefs_inode *ti; struct inode *inode; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (IS_ERR(dentry)) return dentry; @@ -507,7 +543,7 @@ struct dentry *eventfs_create_events_dir(const char *name, INIT_LIST_HEAD(&ei->e_top_files); ti = get_tracefs(inode); - ti->flags |= TRACEFS_EVENT_INODE; + ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; ti->private = ei; inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; @@ -538,6 +574,9 @@ struct eventfs_file *eventfs_add_subsystem_dir(const char *name, struct eventfs_inode *ei_parent; struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!parent) return ERR_PTR(-EINVAL); @@ -569,6 +608,9 @@ struct eventfs_file *eventfs_add_dir(const char *name, { struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!ef_parent) return ERR_PTR(-EINVAL); @@ -606,6 +648,9 @@ int eventfs_add_events_file(const char *name, umode_t mode, struct eventfs_inode *ei; struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + if (!parent) return -EINVAL; @@ -654,6 +699,9 @@ int eventfs_add_file(const char *name, umode_t mode, { struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + if (!ef_parent) return -EINVAL; @@ -791,7 +839,6 @@ void eventfs_remove(struct eventfs_file *ef) void eventfs_remove_events_dir(struct dentry *dentry) { struct tracefs_inode *ti; - struct eventfs_inode *ei; if (!dentry || !dentry->d_inode) return; @@ -800,8 +847,6 @@ void eventfs_remove_events_dir(struct dentry *dentry) if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) return; - ei = ti->private; d_invalidate(dentry); dput(dentry); - kfree(ei); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de5b72216b1a..891653ba9cf3 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) ti = get_tracefs(inode); if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ef_status_free(dentry); + eventfs_set_ef_status_free(ti, dentry); iput(inode); } @@ -673,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + return __create_dir(name, parent, &simple_dir_inode_operations); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 69c2b1d87c46..4f2e49e2197b 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -3,7 +3,8 @@ #define _TRACEFS_INTERNAL_H enum { - TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_TOP_INODE = BIT(2), }; struct tracefs_inode { @@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb); struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); struct dentry *eventfs_failed_creating(struct dentry *dentry); struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_set_ef_status_free(struct dentry *dentry); +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ |