diff options
Diffstat (limited to 'fs')
101 files changed, 1168 insertions, 820 deletions
diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 0c51889a60b3..29281b7c3887 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -46,8 +46,8 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) * NOTE: these are set after open so only reflect 9p client not * underlying file system on server. */ -static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags, - int s_cache, unsigned int f_flags) +static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags, + unsigned int s_cache, unsigned int f_flags) { if (fid->qid.type != P9_QTFILE) return; @@ -57,7 +57,7 @@ static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags, (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { fid->mode |= P9L_DIRECT; /* no read or write cache */ } else if ((!(s_cache & CACHE_WRITEBACK)) || - (f_flags & O_DSYNC) | (s_flags & V9FS_SYNC)) { + (f_flags & O_DSYNC) || (s_flags & V9FS_SYNC)) { fid->mode |= P9L_NOWRITECACHE; } } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index c7f774fe398f..d525957594b6 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -545,8 +545,6 @@ void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) p9_client_begin_disconnect(v9ses->clnt); } -extern int v9fs_error_init(void); - static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 06a2514f0d88..698c43dd5dc8 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -108,7 +108,7 @@ enum p9_cache_bits { struct v9fs_session_info { /* options */ - unsigned char flags; + unsigned int flags; unsigned char nodev; unsigned short debug; unsigned int afid; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 45b684b7d8d7..4102759a5cb5 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -208,7 +208,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; __le32 version; loff_t i_size; - int retval = 0; + int retval = 0, put_err; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", @@ -221,7 +221,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); - retval = p9_fid_put(fid); + put_err = p9_fid_put(fid); + retval = retval < 0 ? retval : put_err; } if ((filp->f_mode & FMODE_WRITE)) { diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 2996fb00387f..11cd8d23f6f2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -505,10 +505,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp); if (!(v9ses->cache & CACHE_WRITEBACK)) { - p9_debug(P9_DEBUG_CACHE, "(no mmap mode)"); - if (vma->vm_flags & VM_MAYSHARE) - return -ENODEV; - invalidate_inode_pages2(filp->f_mapping); + p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)"); return generic_file_readonly_mmap(filp, vma); } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 36b466e35887..950cf61f118b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -163,7 +163,6 @@ int v9fs_uflags2omode(int uflags, int extended) { int ret; - ret = 0; switch (uflags&3) { default: case O_RDONLY: @@ -603,7 +602,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); - err = 0; name = dentry->d_name.name; dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { @@ -815,8 +813,6 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); - err = 0; - v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses)); @@ -912,7 +908,6 @@ v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, return -EINVAL; p9_debug(P9_DEBUG_VFS, "\n"); - retval = 0; old_inode = d_inode(old_dentry); new_inode = d_inode(new_dentry); v9ses = v9fs_inode2v9ses(old_inode); @@ -1066,7 +1061,6 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, if (retval) return retval; - retval = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (iattr->ia_valid & ATTR_FILE) { fid = iattr->ia_file->private_data; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5361cd2d7996..14510872ecc3 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -366,7 +366,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); - err = 0; v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig index 3b3a6b1423c6..54c12d9484cb 100644 --- a/fs/autofs/Kconfig +++ b/fs/autofs/Kconfig @@ -1,18 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -config AUTOFS4_FS - tristate "Old Kconfig name for Kernel automounter support" - select AUTOFS_FS - help - This name exists for people to just automatically pick up the - new name of the autofs Kconfig option. All it does is select - the new option name. - - It will go away in a release or two as people have - transitioned to just plain AUTOFS_FS. - config AUTOFS_FS tristate "Kernel automounter support (supports v3, v4 and v5)" - default n help The automounter is a tool to automatically mount remote file systems on demand. This implementation is partially kernel-based to reduce diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 48ae509f2ac2..82324c327a50 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -441,13 +441,23 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; + int progress; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return; + /* + * We've already failed to allocate from this block group, so even if + * there's enough space in the block group it isn't contiguous enough to + * allow for an allocation, so wait for at least the next wakeup tick, + * or for the thing to be done. + */ + progress = atomic_read(&caching_ctl->progress); + wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || - (cache->free_space_ctl->free_space >= num_bytes)); + (progress != atomic_read(&caching_ctl->progress) && + (cache->free_space_ctl->free_space >= num_bytes))); btrfs_put_caching_control(caching_ctl); } @@ -499,12 +509,16 @@ static void fragment_free_space(struct btrfs_block_group *block_group) * used yet since their free space will be released as soon as the transaction * commits. */ -u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end) +int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, + u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; - u64 extent_start, extent_end, size, total_added = 0; + u64 extent_start, extent_end, size; int ret; + if (total_added_ret) + *total_added_ret = 0; + while (start < end) { ret = find_first_extent_bit(&info->excluded_extents, start, &extent_start, &extent_end, @@ -517,10 +531,12 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; - total_added += size; ret = btrfs_add_free_space_async_trimmed(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ + if (ret) + return ret; + if (total_added_ret) + *total_added_ret += size; start = extent_end + 1; } else { break; @@ -529,13 +545,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end if (start < end) { size = end - start; - total_added += size; ret = btrfs_add_free_space_async_trimmed(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ + if (ret) + return ret; + if (total_added_ret) + *total_added_ret += size; } - return total_added; + return 0; } /* @@ -779,8 +797,13 @@ next: if (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, last, - key.objectid); + u64 space_added; + + ret = add_new_free_space(block_group, last, key.objectid, + &space_added); + if (ret) + goto out; + total_found += space_added; if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + fs_info->nodesize; @@ -789,17 +812,18 @@ next: if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; - if (wakeup) + if (wakeup) { + atomic_inc(&caching_ctl->progress); wake_up(&caching_ctl->wait); + } } } path->slots[0]++; } - ret = 0; - - total_found += add_new_free_space(block_group, last, - block_group->start + block_group->length); + ret = add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); out: btrfs_free_path(path); return ret; @@ -898,6 +922,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); + atomic_set(&caching_ctl->progress, 0); btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); @@ -1640,13 +1665,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - trace_btrfs_add_unused_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); - } else { + } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. */ + trace_btrfs_add_unused_block_group(bg); list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); @@ -2087,6 +2113,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) /* Shouldn't have super stripes in sequential zones */ if (zoned && nr) { + kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", cache->start); @@ -2292,9 +2319,11 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, cache->start, - cache->start + cache->length); + ret = add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); + if (ret) + goto error; } ret = btrfs_add_block_group_cache(info, cache); @@ -2668,6 +2697,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) next: btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); + clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } btrfs_trans_release_chunk_metadata(trans); } @@ -2707,6 +2737,13 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran if (!cache) return ERR_PTR(-ENOMEM); + /* + * Mark it as new before adding it to the rbtree of block groups or any + * list, so that no other task finds it and calls btrfs_mark_bg_unused() + * before the new flag is set. + */ + set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); + cache->length = size; set_free_space_tree_thresholds(cache); cache->flags = type; @@ -2730,9 +2767,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - add_new_free_space(cache, chunk_offset, chunk_offset + size); - + ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); + if (ret) { + btrfs_put_block_group(cache); + return ERR_PTR(ret); + } /* * Ensure the corresponding space_info object is created and diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index f204addc3fe8..74b61e663028 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -70,6 +70,11 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, + /* + * Indicate that block group is in the list of new block groups of a + * transaction. + */ + BLOCK_GROUP_FLAG_NEW, }; enum btrfs_caching_type { @@ -85,6 +90,8 @@ struct btrfs_caching_control { wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; + /* Track progress of caching during allocation. */ + atomic_t progress; refcount_t count; }; @@ -284,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); -u64 add_new_free_space(struct btrfs_block_group *block_group, - u64 start, u64 end); +int add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 6279d200cf83..77684c5e0c8b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -349,6 +349,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) } read_unlock(&fs_info->global_root_lock); + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2d2b313bde5..9419f4e37a58 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -443,6 +443,7 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + u64 last_index; struct extent_state *llseek_cached_state; }; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6b457b010cbc..6d51db066503 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1632,6 +1632,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) } bool btrfs_readdir_get_delayed_items(struct inode *inode, + u64 last_index, struct list_head *ins_list, struct list_head *del_list) { @@ -1651,14 +1652,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); - while (item) { + while (item && item->index <= last_index) { refcount_inc(&item->refs); list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); - while (item) { + while (item && item->index <= last_index) { refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 4f21daa3dbc7..dc1085b2a397 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -148,6 +148,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info); /* Used for readdir() */ bool btrfs_readdir_get_delayed_items(struct inode *inode, + u64 last_index, struct list_head *ins_list, struct list_head *del_list); void btrfs_readdir_put_delayed_items(struct inode *inode, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7513388b0567..a9a2c5446c18 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1103,7 +1103,8 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - !btrfs_is_data_reloc_root(root)) { + !btrfs_is_data_reloc_root(root) && + is_fstree(root->root_key.objectid)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1300,6 +1301,16 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, root = btrfs_get_global_root(fs_info, objectid); if (root) return root; + + /* + * If we're called for non-subvolume trees, and above function didn't + * find one, do not try to read it from disk. + * + * This is namely for free-space-tree and quota tree, which can change + * at runtime and should only be grabbed from fs_info. + */ + if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { @@ -3438,11 +3449,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * For devices supporting discard turn on discard=async automatically, * unless it's already set or disabled. This could be turned off by * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. */ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || btrfs_test_opt(fs_info, DISCARD_ASYNC) || btrfs_test_opt(fs_info, NODISCARD)) && - fs_info->fs_devices->discardable) { + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) { btrfs_set_and_info(fs_info, DISCARD_ASYNC, "auto enabling async discard"); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 911908ea5f6f..f396a9afa403 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4310,8 +4310,11 @@ have_block_group: ret = 0; } - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { + if (!cache_block_group_error) + cache_block_group_error = -EIO; goto loop; + } if (!find_free_extent_check_size_class(ffe_ctl, block_group)) goto loop; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a91d5ad27984..90ad3006ef3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -902,7 +902,30 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, size -= len; pg_offset += len; disk_bytenr += len; - bio_ctrl->len_to_oe_boundary -= len; + + /* + * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * sector aligned. alloc_new_bio() then sets it to the end of + * our ordered extent for writes into zoned devices. + * + * When len_to_oe_boundary is tracking an ordered extent, we + * trust the ordered extent code to align things properly, and + * the check above to cap our write to the ordered extent + * boundary is correct. + * + * When len_to_oe_boundary is U32_MAX, the cap above would + * result in a 4095 byte IO for the last page right before + * we hit the bio limit of UINT_MAX. bio_add_page() has all + * the checks required to make sure we don't overflow the bio, + * and we should just ignore len_to_oe_boundary completely + * unless we're using it to track an ordered extent. + * + * It's pretty hard to make a bio sized U32_MAX, but it can + * happen when the page cache is able to feed us contiguous + * pages for large extents. + */ + if (bio_ctrl->len_to_oe_boundary != U32_MAX) + bio_ctrl->len_to_oe_boundary -= len; /* Ordered extent boundary: move on to a new bio. */ if (bio_ctrl->len_to_oe_boundary == 0) @@ -2145,6 +2168,12 @@ retry: continue; } + if (!folio_test_dirty(folio)) { + /* Someone wrote it for us. */ + folio_unlock(folio); + continue; + } + if (wbc->sync_mode != WB_SYNC_NONE) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); @@ -2164,11 +2193,12 @@ retry: } /* - * the filesystem may choose to bump up nr_to_write. + * The filesystem may choose to bump up nr_to_write. * We have to make sure to honor the new nr_to_write - * at any time + * at any time. */ - nr_to_write_done = wbc->nr_to_write <= 0; + nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && + wbc->nr_to_write <= 0); } folio_batch_release(&fbatch); cond_resched(); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0cdb3e86f29b..a6d8368ed0ed 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -760,8 +760,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { start = em_end; - if (end != (u64)-1) - len = start + len - em_end; goto next; } @@ -829,8 +827,8 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (!split) goto remove_em; } - split->start = start + len; - split->len = em_end - (start + len); + split->start = end; + split->len = em_end - end; split->block_start = em->block_start; split->flags = flags; split->compress_type = em->compress_type; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 045ddce32eca..f169378e2ca6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1515,9 +1515,13 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { - total_found += add_new_free_space(block_group, - extent_start, - offset); + u64 space_added; + + ret = add_new_free_space(block_group, extent_start, + offset, &space_added); + if (ret) + goto out; + total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; wake_up(&caching_ctl->wait); @@ -1529,8 +1533,9 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - total_found += add_new_free_space(block_group, extent_start, - end); + ret = add_new_free_space(block_group, extent_start, end, NULL); + if (ret) + goto out; extent_count++; } @@ -1569,6 +1574,8 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, end = block_group->start + block_group->length; while (1) { + u64 space_added; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; @@ -1583,8 +1590,11 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - total_found += add_new_free_space(block_group, key.objectid, - key.objectid + key.offset); + ret = add_new_free_space(block_group, key.objectid, + key.objectid + key.offset, &space_added); + if (ret) + goto out; + total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; wake_up(&caching_ctl->wait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dbbb67293e34..aa090b0b5d29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1654,8 +1654,6 @@ out_unlock: clear_bits, page_ops); start += cur_alloc_size; - if (start >= end) - return ret; } /* @@ -1664,9 +1662,11 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits | EXTENT_CLEAR_DATA_RESV, - page_ops); + if (start < end) { + clear_bits |= EXTENT_CLEAR_DATA_RESV; + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); + } return ret; } @@ -3482,15 +3482,21 @@ zeroit: void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); - spin_lock(&fs_info->delayed_iput_lock); + /* + * Need to be irq safe here because we can be called from either an irq + * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq + * context. + */ + spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } @@ -3499,37 +3505,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { - - spin_lock(&fs_info->delayed_iput_lock); + /* + * btrfs_put_ordered_extent() can run in irq context (see bio.c), which + * calls btrfs_add_delayed_iput() and that needs to lock + * fs_info->delayed_iput_lock. So we need to disable irqs here to + * prevent a deadlock. + */ + spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); - cond_resched_lock(&fs_info->delayed_iput_lock); + if (need_resched()) { + spin_unlock_irq(&fs_info->delayed_iput_lock); + cond_resched(); + spin_lock_irq(&fs_info->delayed_iput_lock); + } } - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } /* @@ -3659,11 +3674,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(fs_info->sb, last_objectid, root); - ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ENOENT) - goto out; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + if (ret != -ENOENT) + goto out; + } - if (ret == -ENOENT && root == fs_info->tree_root) { + if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; @@ -3724,17 +3742,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (ret == -ENOENT || inode->i_nlink) { - if (!ret) { + if (!inode || inode->i_nlink) { + if (inode) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + inode = NULL; if (ret) goto out; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - iput(inode); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", @@ -3742,10 +3760,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); - if (ret) { - iput(inode); + if (ret) goto out; - } continue; } @@ -4847,9 +4863,6 @@ again: ret = -ENOMEM; goto out; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_read_folio(NULL, page_folio(page)); @@ -4864,6 +4877,17 @@ again: goto out_unlock; } } + + /* + * We unlock the page after the io is completed and then re-lock it + * above. release_folio() could have come in between that and cleared + * PagePrivate(), but left the page in the mapping. Set the page mapped + * here to make sure it's properly set for the subpage stuff. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + wait_on_page_writeback(page); lock_extent(io_tree, block_start, block_end, &cached_state); @@ -5849,6 +5873,74 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, } /* + * Find the highest existing sequence number in a directory and then set the + * in-memory index_cnt variable to the first free sequence number. + */ +static int btrfs_set_inode_index_count(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + if (path->slots[0] == 0) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + found_key.type != BTRFS_DIR_INDEX_KEY) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + inode->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) +{ + if (dir->index_cnt == (u64)-1) { + int ret; + + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = dir->index_cnt; + + return 0; +} + +/* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy * our information into that, and then dir_emit from the buffer. This is @@ -5860,10 +5952,17 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, static int btrfs_opendir(struct inode *inode, struct file *file) { struct btrfs_file_private *private; + u64 last_index; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); + if (ret) + return ret; private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); if (!private) return -ENOMEM; + private->last_index = last_index; private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!private->filldir_buf) { kfree(private); @@ -5930,7 +6029,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); - put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, private->last_index, + &ins_list, &del_list); again: key.type = BTRFS_DIR_INDEX_KEY; @@ -5948,6 +6048,8 @@ again: break; if (found_key.offset < ctx->pos) continue; + if (found_key.offset > private->last_index) + break; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) continue; di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); @@ -6084,57 +6186,6 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, } /* - * find the highest existing sequence number in a directory - * and then set the in-memory index_cnt variable to reflect - * free sequence numbers - */ -static int btrfs_set_inode_index_count(struct btrfs_inode *inode) -{ - struct btrfs_root *root = inode->root; - struct btrfs_key key, found_key; - struct btrfs_path *path; - struct extent_buffer *leaf; - int ret; - - key.objectid = btrfs_ino(inode); - key.type = BTRFS_DIR_INDEX_KEY; - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - /* FIXME: we should be able to handle this */ - if (ret == 0) - goto out; - ret = 0; - - if (path->slots[0] == 0) { - inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; - } - - path->slots[0]--; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != btrfs_ino(inode) || - found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; - } - - inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; -} - -/* * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree */ @@ -7849,8 +7900,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - bbio->bio.bi_status = errno_to_blk_status(ret); - btrfs_dio_end_io(bbio); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); return; } } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index da1f84a0eb29..2637d6b157ff 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4445,4 +4445,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) ulist_free(entry->old_roots); kfree(entry); } + *root = RB_ROOT; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f37b925d587f..0249ea52bb80 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -71,7 +71,7 @@ static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio); static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) @@ -2404,7 +2404,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2445,9 +2445,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - if (!need_check) - goto writeback; - p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) return -ENOMEM; @@ -2516,7 +2513,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) q_sector.page = NULL; } -writeback: /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore @@ -2699,7 +2695,6 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) static void scrub_rbio(struct btrfs_raid_bio *rbio) { - bool need_check = false; int sector_nr; int ret; @@ -2722,7 +2717,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) * We have every sector properly prepared. Can finish the scrub * and writeback the good content. */ - ret = finish_parity_scrub(rbio, need_check); + ret = finish_parity_scrub(rbio); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 25a3361caedc..46c3c1d57266 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1916,7 +1916,39 @@ again: err = PTR_ERR(root); break; } - ASSERT(root->reloc_root == reloc_root); + + if (unlikely(root->reloc_root != reloc_root)) { + if (root->reloc_root) { + btrfs_err(fs_info, +"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", + root->root_key.objectid, + root->reloc_root->root_key.objectid, + root->reloc_root->root_key.type, + root->reloc_root->root_key.offset, + btrfs_root_generation( + &root->reloc_root->root_item), + reloc_root->root_key.objectid, + reloc_root->root_key.type, + reloc_root->root_key.offset, + btrfs_root_generation( + &reloc_root->root_item)); + } else { + btrfs_err(fs_info, +"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", + root->root_key.objectid, + reloc_root->root_key.objectid, + reloc_root->root_key.type, + reloc_root->root_key.offset, + btrfs_root_generation( + &reloc_root->root_item)); + } + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(root); + btrfs_abort_transaction(trans, -EUCLEAN); + if (!err) + err = -EUCLEAN; + break; + } /* * set reference count to 1, so btrfs_recover_relocation @@ -1989,7 +2021,7 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - if (IS_ERR(root)) { + if (WARN_ON(IS_ERR(root))) { /* * For recovery we read the fs roots on mount, * and if we didn't find the root then we marked @@ -1998,17 +2030,14 @@ again: * memory. However there's no reason we can't * handle the error properly here just in case. */ - ASSERT(0); ret = PTR_ERR(root); goto out; } - if (root->reloc_root != reloc_root) { + if (WARN_ON(root->reloc_root != reloc_root)) { /* - * This is actually impossible without something - * going really wrong (like weird race condition - * or cosmic rays). + * This can happen if on-disk metadata has some + * corruption, e.g. bad reloc tree key offset. */ - ASSERT(0); ret = -EINVAL; goto out; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4cae41bd6de0..7289f5bff397 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -605,7 +605,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr btrfs_stack_header_bytenr(header), logical); return; } - if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) { + if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0) { bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cf306351b148..91b6c2fdc420 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -826,8 +826,13 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); - if (trans == ERR_PTR(-ENOENT)) - btrfs_wait_for_commit(root->fs_info, 0); + if (trans == ERR_PTR(-ENOENT)) { + int ret; + + ret = btrfs_wait_for_commit(root->fs_info, 0); + if (ret) + return ERR_PTR(ret); + } return trans; } @@ -931,6 +936,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) } wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); + ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); out: return ret; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 038dfa8f1788..ab08a0b01311 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -446,6 +446,20 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_key_to_cpu(leaf, &item_key, slot); is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); + /* + * Bad rootid for reloc trees. + * + * Reloc trees are only for subvolume trees, other trees only need + * to be COWed to be relocated. + */ + if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && + !is_fstree(key->offset))) { + generic_err(leaf, slot, + "invalid reloc tree for root %lld, root id is not a subvolume tree", + key->offset); + return -EUCLEAN; + } + /* No such tree id */ if (unlikely(key->objectid == 0)) { if (is_root_item) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73f9ea7672db..6aa9bf3661ac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4078,14 +4078,6 @@ static int alloc_profile_is_valid(u64 flags, int extended) return has_single_bit_set(flags); } -static inline int balance_need_close(struct btrfs_fs_info *fs_info) -{ - /* cancel requested || normal exit path */ - return atomic_read(&fs_info->balance_cancel_req) || - (atomic_read(&fs_info->balance_pause_req) == 0 && - atomic_read(&fs_info->balance_cancel_req) == 0); -} - /* * Validate target profile against allowed profiles and return true if it's OK. * Otherwise print the error message and return false. @@ -4275,6 +4267,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, u64 num_devices; unsigned seq; bool reducing_redundancy; + bool paused = false; int i; if (btrfs_fs_closing(fs_info) || @@ -4405,6 +4398,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + paused = true; } /* * Balance can be canceled by: @@ -4433,8 +4427,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_update_ioctl_balance_args(fs_info, bargs); } - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { + /* We didn't pause, we can clean everything up. */ + if (!paused) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); } @@ -4644,8 +4638,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) } } - BUG_ON(fs_info->balance_ctl || - test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; @@ -6404,7 +6397,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); - *mirror_num_ret = mirror_num; + if (mirror_num_ret) + *mirror_num_ret = mirror_num; *bioc_ret = NULL; ret = 0; goto out; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 85b8b332add9..72b90bc19a19 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -805,6 +805,9 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) return -EINVAL; } + btrfs_clear_and_info(info, DISCARD_ASYNC, + "zoned: async discard ignored and disabled for zoned mode"); + return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4a2b39d9a61a..bdcffb04513f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2019,9 +2019,10 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) } } +WRAP_DIR_ITER(ceph_readdir) // FIXME! const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, @@ -2033,7 +2034,7 @@ const struct file_operations ceph_dir_fops = { }; const struct file_operations ceph_snapdir_fops = { - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 66048a86c480..5fb367b1d4b0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4764,7 +4764,7 @@ static void delayed_work(struct work_struct *work) dout("mdsc delayed_work\n"); - if (mdsc->stopping) + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) return; mutex_lock(&mdsc->mutex); @@ -4943,7 +4943,7 @@ void send_flush_mdlog(struct ceph_mds_session *s) void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) { dout("pre_umount\n"); - mdsc->stopping = 1; + mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 724307ff89cd..86d2965e68a1 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -380,6 +380,11 @@ struct cap_wait { int want; }; +enum { + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHED = 2, +}; + /* * mds client state */ diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index cce78d769f55..6d3584f16f9a 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -216,7 +216,7 @@ static void metric_delayed_work(struct work_struct *work) struct ceph_mds_client *mdsc = container_of(m, struct ceph_mds_client, metric); - if (mdsc->stopping) + if (mdsc->stopping || disable_send_metrics) return; if (!m->session || !check_session_state(m->session)) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 3fc48b43cab0..a5f52013314d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1374,6 +1374,16 @@ static void ceph_kill_sb(struct super_block *s) ceph_mdsc_pre_umount(fsc->mdsc); flush_fs_workqueues(fsc); + /* + * Though the kill_anon_super() will finally trigger the + * sync_filesystem() anyway, we still need to do it here + * and then bump the stage of shutdown to stop the work + * queue as earlier as possible. + */ + sync_filesystem(s); + + fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 8450b1bd354b..1b960de2bf39 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -429,21 +429,14 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; - if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { + if (host_file->f_op->iterate_shared) { struct inode *host_inode = file_inode(host_file); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - if (host_file->f_op->iterate_shared) { - inode_lock_shared(host_inode); - ret = host_file->f_op->iterate_shared(host_file, ctx); - file_accessed(host_file); - inode_unlock_shared(host_inode); - } else { - inode_lock(host_inode); - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); - inode_unlock(host_inode); - } + inode_lock_shared(host_inode); + ret = host_file->f_op->iterate_shared(host_file, ctx); + file_accessed(host_file); + inode_unlock_shared(host_inode); } return ret; } @@ -585,10 +578,11 @@ const struct inode_operations coda_dir_inode_operations = { .setattr = coda_setattr, }; +WRAP_DIR_ITER(coda_readdir) // FIXME! const struct file_operations coda_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = coda_readdir, + .iterate_shared = shared_coda_readdir, .open = coda_open, .release = coda_release, .fsync = coda_fsync, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2a29943fa5cc..cfad1eac7fd9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, *maptype = 0; return inpage; } - kunmap_atomic(inpage); + kunmap_local(inpage); might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { @@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; @@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, const unsigned int lefthalf = rq->outputsize - righthalf; const unsigned int interlaced_offset = rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; + u8 *src; if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); @@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, } src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); if (outpages > inpages) { DBG_BUGON(!rq->out[outpages - 1]); if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 0 : righthalf), + lefthalf); } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); } } kunmap_local(src); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d70b12b81507..e12592727a54 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -183,7 +183,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && - vi->datalayout == EROFS_INODE_FLAT_PLAIN) + (vi->datalayout == EROFS_INODE_FLAT_PLAIN || + vi->datalayout == EROFS_INODE_CHUNK_BASED)) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9d6a3c6158bd..566f68ddfa36 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -889,8 +889,6 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; - WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); - /* pseudo mount for anon inodes */ if (sb->s_flags & SB_KERNMOUNT) { kill_anon_super(sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5f1890e309c6..de4f12152b62 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1035,7 +1035,7 @@ hitted: */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - cur = end - min_t(unsigned int, offset + end - map->m_la, end); + cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); goto next_part; @@ -1144,10 +1144,11 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec) { struct z_erofs_bvec_item *item; + unsigned int pgnr; - if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { - unsigned int pgnr; - + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && + (bvec->end == PAGE_SIZE || + bvec->offset + bvec->end == be->pcl->length)) { pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); if (!be->decompressed_pages[pgnr]) { @@ -1841,7 +1842,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, } cur = map->m_la + map->m_llen - 1; - while (cur >= end) { + while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; struct page *page; diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 9f42f25fab92..e918decb3735 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -69,7 +69,7 @@ static int exfat_allocate_bitmap(struct super_block *sb, } sbi->map_sectors = ((need_map_size - 1) >> (sb->s_blocksize_bits)) + 1; - sbi->vol_amap = kmalloc_array(sbi->map_sectors, + sbi->vol_amap = kvmalloc_array(sbi->map_sectors, sizeof(struct buffer_head *), GFP_KERNEL); if (!sbi->vol_amap) return -ENOMEM; @@ -84,7 +84,7 @@ static int exfat_allocate_bitmap(struct super_block *sb, while (j < i) brelse(sbi->vol_amap[j++]); - kfree(sbi->vol_amap); + kvfree(sbi->vol_amap); sbi->vol_amap = NULL; return -EIO; } @@ -138,7 +138,7 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi) for (i = 0; i < sbi->map_sectors; i++) __brelse(sbi->vol_amap[i]); - kfree(sbi->vol_amap); + kvfree(sbi->vol_amap); } int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 957574180a5e..e1586bba6d86 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -34,6 +34,7 @@ static int exfat_get_uniname_from_ext_entry(struct super_block *sb, { int i, err; struct exfat_entry_set_cache es; + unsigned int uni_len = 0, len; err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES); if (err) @@ -52,7 +53,10 @@ static int exfat_get_uniname_from_ext_entry(struct super_block *sb, if (exfat_get_entry_type(ep) != TYPE_EXTEND) break; - exfat_extract_uni_name(ep, uniname); + len = exfat_extract_uni_name(ep, uniname); + uni_len += len; + if (len != EXFAT_FILE_NAME_LEN || uni_len >= MAX_NAME_LENGTH) + break; uniname += EXFAT_FILE_NAME_LEN; } @@ -214,7 +218,10 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) exfat_init_namebuf(nb); } -/* skip iterating emit_dots when dir is empty */ +/* + * Before calling dir_emit*(), sbi->s_lock should be released + * because page fault can occur in dir_emit*(). + */ #define ITER_POS_FILLED_DOTS (2) static int exfat_iterate(struct file *file, struct dir_context *ctx) { @@ -229,11 +236,10 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx) int err = 0, fake_offset = 0; exfat_init_namebuf(nb); - mutex_lock(&EXFAT_SB(sb)->s_lock); cpos = ctx->pos; if (!dir_emit_dots(file, ctx)) - goto unlock; + goto out; if (ctx->pos == ITER_POS_FILLED_DOTS) { cpos = 0; @@ -245,16 +251,18 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx) /* name buffer should be allocated before use */ err = exfat_alloc_namebuf(nb); if (err) - goto unlock; + goto out; get_new: + mutex_lock(&EXFAT_SB(sb)->s_lock); + if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode)) goto end_of_dir; err = exfat_readdir(inode, &cpos, &de); if (err) { /* - * At least we tried to read a sector. Move cpos to next sector - * position (should be aligned). + * At least we tried to read a sector. + * Move cpos to next sector position (should be aligned). */ if (err == -EIO) { cpos += 1 << (sb->s_blocksize_bits); @@ -277,16 +285,10 @@ get_new: inum = iunique(sb, EXFAT_ROOT_INO); } - /* - * Before calling dir_emit(), sb_lock should be released. - * Because page fault can occur in dir_emit() when the size - * of buffer given from user is larger than one page size. - */ mutex_unlock(&EXFAT_SB(sb)->s_lock); if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum, (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG)) - goto out_unlocked; - mutex_lock(&EXFAT_SB(sb)->s_lock); + goto out; ctx->pos = cpos; goto get_new; @@ -294,9 +296,8 @@ end_of_dir: if (!cpos && fake_offset) cpos = ITER_POS_FILLED_DOTS; ctx->pos = cpos; -unlock: mutex_unlock(&EXFAT_SB(sb)->s_lock); -out_unlocked: +out: /* * To improve performance, free namebuf after unlock sb_lock. * If namebuf is not allocated, this function do nothing @@ -305,10 +306,11 @@ out_unlocked: return err; } +WRAP_DIR_ITER(exfat_iterate) // FIXME! const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = exfat_iterate, + .iterate_shared = shared_exfat_iterate, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, @@ -1079,7 +1081,8 @@ rewind: if (entry_type == TYPE_EXTEND) { unsigned short entry_uniname[16], unichar; - if (step != DIRENT_STEP_NAME) { + if (step != DIRENT_STEP_NAME || + name_len >= MAX_NAME_LENGTH) { step = DIRENT_STEP_FILE; continue; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 40e624cf7e92..d1dbe47c7975 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -315,7 +315,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->iterate && !file->f_op->iterate_shared) + if (!file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a2475b8c9fb5..21b903fe546e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * fls() instead since we need to know the actual length while modifying * goal length. */ - order = fls(ac->ac_g_ex.fe_len); + order = fls(ac->ac_g_ex.fe_len) - 1; min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; - if (1 << min_order < ac->ac_o_ex.fe_len) - min_order = fls(ac->ac_o_ex.fe_len) + 1; - if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of @@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) - min_order = fls(num_stripe_clusters); + /* + * We consider 1 order less because later we round + * up the goal len to num_stripe_clusters + */ + min_order = fls(num_stripe_clusters) - 1; } + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len); + for (i = order; i >= min_order; i--) { int frag_order; /* @@ -4761,8 +4765,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *tmp_pa, *cpa = NULL; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; + loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4770,47 +4774,151 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; - /* first, try per-file preallocation */ + /* + * first, try per-file preallocation by searching the inode pa rbtree. + * + * Here, we can't do a direct traversal of the tree because + * ext4_mb_discard_group_preallocation() can paralelly mark the pa + * deleted and that can cause direct traversal to skip some entries. + */ read_lock(&ei->i_prealloc_lock); + + if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { + goto try_group_pa; + } + + /* + * Step 1: Find a pa with logical start immediately adjacent to the + * original logical start. This could be on the left or right. + * + * (tmp_pa->pa_lstart never changes so we can skip locking for it). + */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, - tmp_pa_start, iter)) { + tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); + } - /* all fields in this condition don't change, - * so we can skip locking for them */ - tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - /* original request start doesn't lie in this PA */ - if (ac->ac_o_ex.fe_logical < tmp_pa_start || - ac->ac_o_ex.fe_logical >= tmp_pa_end) - continue; + /* + * Step 2: The adjacent pa might be to the right of logical start, find + * the left adjacent pa. After this step we'd have a valid tmp_pa whose + * logical start is towards the left of original request's logical start + */ + if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + tmp = rb_prev(&tmp_pa->pa_node.inode_node); - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > - EXT4_MAX_BLOCK_FILE_PHYS)) { + if (tmp) { + tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, + pa_node.inode_node); + } else { /* - * Since PAs don't overlap, we won't find any - * other PA to satisfy this. + * If there is no adjacent pa to the left then finding + * an overlapping pa is not possible hence stop searching + * inode pa tree */ - break; + goto try_group_pa; } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); - /* found preallocated blocks, use them */ + /* + * Step 3: If the left adjacent pa is deleted, keep moving left to find + * the first non deleted adjacent pa. After this step we should have a + * valid tmp_pa which is guaranteed to be non deleted. + */ + for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { + if (!iter) { + /* + * no non deleted left adjacent pa, so stop searching + * inode pa tree + */ + goto try_group_pa; + } + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free && - likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { - atomic_inc(&tmp_pa->pa_count); - ext4_mb_use_inode_pa(ac, tmp_pa); + if (tmp_pa->pa_deleted == 0) { + /* + * We will keep holding the pa_lock from + * this point on because we don't want group discard + * to delete this pa underneath us. Since group + * discard is anyways an ENOSPC operation it + * should be okay for it to wait a few more cycles. + */ + break; + } else { spin_unlock(&tmp_pa->pa_lock); - read_unlock(&ei->i_prealloc_lock); - return true; } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); + BUG_ON(tmp_pa->pa_deleted == 1); + + /* + * Step 4: We now have the non deleted left adjacent pa. Only this + * pa can possibly satisfy the request hence check if it overlaps + * original logical start and stop searching if it doesn't. + */ + tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) { + /* + * Since PAs don't overlap, we won't find any other PA to + * satisfy this. + */ + spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { + atomic_inc(&tmp_pa->pa_count); + ext4_mb_use_inode_pa(ac, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); + read_unlock(&ei->i_prealloc_lock); + return true; + } else { + /* + * We found a valid overlapping pa but couldn't use it because + * it had no free blocks. This should ideally never happen + * because: + * + * 1. When a new inode pa is added to rbtree it must have + * pa_free > 0 since otherwise we won't actually need + * preallocation. + * + * 2. An inode pa that is in the rbtree can only have it's + * pa_free become zero when another thread calls: + * ext4_mb_new_blocks + * ext4_mb_use_preallocated + * ext4_mb_use_inode_pa + * + * 3. Further, after the above calls make pa_free == 0, we will + * immediately remove it from the rbtree in: + * ext4_mb_new_blocks + * ext4_mb_release_context + * ext4_mb_put_pa + * + * 4. Since the pa_free becoming 0 and pa_free getting removed + * from tree both happen in ext4_mb_new_blocks, which is always + * called with i_data_sem held for data allocations, we can be + * sure that another process will never see a pa in rbtree with + * pa_free == 0. + */ + WARN_ON_ONCE(tmp_pa->pa_free == 0); } + spin_unlock(&tmp_pa->pa_lock); +try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 321e3a888c20..05151d61b00b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1782,6 +1782,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); + + /* + * Update i_inline_off - moved ibody region might contain + * system.data attribute. Handling a failure here won't + * cause other complications for setting an xattr. + */ + if (!is_block && ext4_has_inline_data(inode)) { + ret = ext4_find_inline_data_nolock(inode); + if (ret) { + ext4_warning_inode(inode, + "unable to update i_inline_off"); + goto out; + } + } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); diff --git a/fs/file.c b/fs/file.c index 7893ea161d77..3fd003a8604f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1036,16 +1036,30 @@ unsigned long __fdget_raw(unsigned int fd) return __fget_light(fd, 0); } +/* + * Try to avoid f_pos locking. We only need it if the + * file is marked for FMODE_ATOMIC_POS, and it can be + * accessed multiple ways. + * + * Always do it for directories, because pidfd_getfd() + * can make a file accessible even if it otherwise would + * not be, and for directories this is a correctness + * issue, not a "POSIX requirement". + */ +static inline bool file_needs_f_pos_lock(struct file *file) +{ + return (file->f_mode & FMODE_ATOMIC_POS) && + (file_count(file) > 1 || file->f_op->iterate_shared); +} + unsigned long __fdget_pos(unsigned int fd) { unsigned long v = __fdget(fd); struct file *file = (struct file *)(v & ~3); - if (file && (file->f_mode & FMODE_ATOMIC_POS)) { - if (file_count(file) > 1) { - v |= FDPUT_POS_UNLOCK; - mutex_lock(&file->f_pos_lock); - } + if (file && file_needs_f_pos_lock(file)) { + v |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); } return v; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 35bc174f9ba2..f67bef9d83c4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -258,7 +258,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) spin_unlock(&fi->lock); } kfree(forget); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || fuse_stale_inode(inode, outarg.generation, &outarg.attr)) @@ -395,8 +395,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; err = -EIO; - if (!outarg->nodeid) - goto out_put_forget; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d66070af145d..f19d748890f0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1134,7 +1134,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { - u64 flags = arg->flags | (u64) arg->flags2 << 32; + u64 flags = arg->flags; + + if (flags & FUSE_INIT_EXT) + flags |= (u64) arg->flags2 << 32; ra_pages = arg->max_readahead / PAGE_SIZE; if (flags & FUSE_ASYNC_READ) @@ -1254,7 +1257,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | - FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP; + FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | + FUSE_HAS_EXPIRE_ONLY; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 8e01bfdfc430..726640fa439e 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,14 +9,23 @@ #include <linux/compat.h> #include <linux/fileattr.h> -static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, + struct fuse_ioctl_out *outarg) { - ssize_t ret = fuse_simple_request(fm, args); + ssize_t ret; + + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; + + ret = fuse_simple_request(fm, args); /* Translate ENOSYS, which shouldn't be returned from fs */ if (ret == -ENOSYS) ret = -ENOTTY; + if (ret >= 0 && outarg->result == -ENOSYS) + outarg->result = -ENOTTY; + return ret; } @@ -264,13 +273,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } ap.args.out_numargs = 2; - ap.args.out_args[0].size = sizeof(outarg); - ap.args.out_args[0].value = &outarg; ap.args.out_args[1].size = out_size; ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_send_ioctl(fm, &ap.args); + transferred = fuse_send_ioctl(fm, &ap.args, &outarg); err = transferred; if (transferred < 0) goto out; @@ -399,12 +406,10 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.in_args[1].size = inarg.in_size; args.in_args[1].value = ptr; args.out_numargs = 2; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; args.out_args[1].size = inarg.out_size; args.out_args[1].value = ptr; - err = fuse_send_ioctl(fm, &args); + err = fuse_send_ioctl(fm, &args, &outarg); if (!err) { if (outarg.result < 0) err = outarg.result; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1bf3c4453516..b43fa8b8fc05 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1578,7 +1578,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1609,7 +1609,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ec1631257978..7e835be7032d 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -230,9 +230,11 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct super_block *sb = sdp->sd_vfs; struct gfs2_bufdata *bd; struct gfs2_meta_header *mh; struct gfs2_trans *tr = current->journal_info; + bool withdraw = false; lock_buffer(bh); if (buffer_pinned(bh)) { @@ -266,13 +268,15 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } - if (unlikely(test_bit(SDF_FROZEN, &sdp->sd_flags))) { - fs_info(sdp, "GFS2:adding buf while frozen\n"); - gfs2_assert_withdraw(sdp, 0); - } if (unlikely(gfs2_withdrawn(sdp))) { fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n", (unsigned long long)bd->bd_bh->b_blocknr); + goto out_unlock; + } + if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) { + fs_info(sdp, "GFS2:adding buf while frozen\n"); + withdraw = true; + goto out_unlock; } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); @@ -281,6 +285,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) tr->tr_num_buf_new++; out_unlock: gfs2_log_unlock(sdp); + if (withdraw) + gfs2_assert_withdraw(sdp, 0); out: unlock_buffer(bh); } diff --git a/fs/inode.c b/fs/inode.c index 8fefb69e1f84..67611a360031 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -16,7 +16,6 @@ #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> -#include <linux/prefetch.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> @@ -1041,8 +1040,6 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&sb->s_inode_list_lock); - inode = new_inode_pseudo(sb); if (inode) inode_sb_list_add(inode); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index adb92cdb24b0..aa8967cca1a3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -872,10 +872,10 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); - if (unlikely(ret < 0)) + if (unlikely(iter.pos == iocb->ki_pos)) return ret; ret = iter.pos - iocb->ki_pos; - iocb->ki_pos += ret; + iocb->ki_pos = iter.pos; return ret; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 51bd38da21cd..9ec91017a7f3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -27,7 +27,7 @@ * * Called with j_list_lock held. */ -static inline void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction = jh->b_cp_transaction; @@ -41,45 +41,6 @@ static inline void __buffer_unlink_first(struct journal_head *jh) } /* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; -} - -/* * Check a checkpoint buffer could be release or not. * * Requires j_list_lock @@ -183,6 +144,7 @@ __flush_batch(journal_t *journal, int *batch_count) struct buffer_head *bh = journal->j_chkpt_bhs[i]; BUFFER_TRACE(bh, "brelse"); __brelse(bh); + journal->j_chkpt_bhs[i] = NULL; } *batch_count = 0; } @@ -242,15 +204,6 @@ restart: jh = transaction->t_checkpoint_list; bh = jh2bh(jh); - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - goto retry; - } if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; @@ -285,30 +238,50 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } - if (!buffer_dirty(bh)) { + if (!trylock_buffer(bh)) { + /* + * The buffer is locked, it may be writing back, or + * flushing out in the last couple of cycles, or + * re-adding into a new transaction, need to check + * it again until it's unlocked. + */ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; + } else if (!buffer_dirty(bh)) { + unlock_buffer(bh); BUFFER_TRACE(bh, "remove from checkpoint"); - if (__jbd2_journal_remove_checkpoint(jh)) - /* The transaction was released; we're done */ + /* + * If the transaction was released or the checkpoint + * list was empty, we're done. + */ + if (__jbd2_journal_remove_checkpoint(jh) || + !transaction->t_checkpoint_list) goto out; - continue; + } else { + unlock_buffer(bh); + /* + * We are about to write the buffer, it could be + * raced by some other transaction shrink or buffer + * re-log logic once we release the j_list_lock, + * leave it on the checkpoint list and check status + * again to make sure it's clean. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + transaction->t_chp_stats.cs_written++; + transaction->t_checkpoint_list = jh->b_cpnext; } - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal - * lock. We cannot afford to let the transaction - * logic start messing around with this buffer before - * we write it to disk, as that would break - * recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - journal->j_chkpt_bhs[batch_count++] = bh; - __buffer_relink_io(jh); - transaction->t_chp_stats.cs_written++; + if ((batch_count == JBD2_NR_BATCH) || - need_resched() || - spin_needbreak(&journal->j_list_lock)) + need_resched() || spin_needbreak(&journal->j_list_lock) || + jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0]) goto unlock_and_flush; } @@ -322,38 +295,6 @@ restart: goto restart; } - /* - * Now we issued all of the transaction's buffers, let's deal - * with the buffers that are out for I/O. - */ -restart2: - /* Did somebody clean up the transaction in the meanwhile? */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - goto out; - - while (transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart2; - } - - /* - * Now in whatever state the buffer currently is, we - * know that it has been written out and so we can - * drop it from the list - */ - if (__jbd2_journal_remove_checkpoint(jh)) - break; - } out: spin_unlock(&journal->j_list_lock); result = jbd2_cleanup_journal_tail(journal); @@ -409,49 +350,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ /* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and - * release them. If 'destroy' is set, clean all buffers unconditionally. - * - * Called with j_list_lock held. - * Returns 1 if we freed the transaction, 0 otherwise. - */ -static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - - if (!destroy && __cp_buffer_busy(jh)) - return 0; - - if (__jbd2_journal_remove_checkpoint(jh)) - return 1; - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - return 0; - } while (jh != last_jh); - - return 0; -} - -/* * journal_shrink_one_cp_list * - * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * Find all the written-back checkpoint buffers in the given list * and try to release them. If the whole transaction is released, set * the 'released' parameter. Return the number of released checkpointed * buffers. @@ -459,15 +360,15 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - unsigned long *nr_to_scan, - bool *released) + bool destroy, bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; unsigned long nr_freed = 0; int ret; - if (!jh || *nr_to_scan == 0) + *released = false; + if (!jh) return 0; last_jh = jh->b_cpprev; @@ -475,12 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - (*nr_to_scan)--; - if (__cp_buffer_busy(jh)) - continue; + if (destroy) { + ret = __jbd2_journal_remove_checkpoint(jh); + } else { + ret = jbd2_journal_try_remove_checkpoint(jh); + if (ret < 0) + continue; + } nr_freed++; - ret = __jbd2_journal_remove_checkpoint(jh); if (ret) { *released = true; break; @@ -488,7 +392,7 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, if (need_resched()) break; - } while (jh != last_jh && *nr_to_scan); + } while (jh != last_jh); return nr_freed; } @@ -506,11 +410,11 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan) { transaction_t *transaction, *last_transaction, *next_transaction; - bool released; + bool __maybe_unused released; tid_t first_tid = 0, last_tid = 0, next_tid = 0; tid_t tid = 0; unsigned long nr_freed = 0; - unsigned long nr_scanned = *nr_to_scan; + unsigned long freed; again: spin_lock(&journal->j_list_lock); @@ -539,19 +443,11 @@ again: transaction = next_transaction; next_transaction = transaction->t_cpnext; tid = transaction->t_tid; - released = false; - - nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, - nr_to_scan, &released); - if (*nr_to_scan == 0) - break; - if (need_resched() || spin_needbreak(&journal->j_list_lock)) - break; - if (released) - continue; - nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, - nr_to_scan, &released); + freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, + false, &released); + nr_freed += freed; + (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) break; if (need_resched() || spin_needbreak(&journal->j_list_lock)) @@ -572,9 +468,8 @@ again: if (*nr_to_scan && next_tid) goto again; out: - nr_scanned -= *nr_to_scan; trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, - nr_freed, nr_scanned, next_tid); + nr_freed, next_tid); return nr_freed; } @@ -590,7 +485,7 @@ out: void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret; + bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) @@ -601,8 +496,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction->t_checkpoint_list, - destroy); + journal_shrink_one_cp_list(transaction->t_checkpoint_list, + destroy, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -610,23 +505,12 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) */ if (need_resched()) return; - if (ret) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, destroy); - if (need_resched()) - return; /* * Stop scanning if we couldn't free the transaction. This * avoids pointless scanning of transactions which still * weren't checkpointed. */ - if (!ret) + if (!released) return; } while (transaction != last_transaction); } @@ -705,7 +589,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ - if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + if (transaction->t_checkpoint_list) return 0; /* @@ -737,6 +621,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) } /* + * Check the checkpoint buffer and try to remove it from the checkpoint + * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if + * it frees the transaction, 0 otherwise. + * + * This function is called with j_list_lock held. + */ +int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!trylock_buffer(bh)) + return -EBUSY; + if (buffer_dirty(bh)) { + unlock_buffer(bh); + return -EBUSY; + } + unlock_buffer(bh); + + /* + * Buffer is clean and the IO has finished (we held the buffer + * lock) so the checkpoint is done. We can safely remove the + * buffer from this transaction. + */ + JBUFFER_TRACE(jh, "remove from checkpoint list"); + return __jbd2_journal_remove_checkpoint(jh); +} + +/* * journal_insert_checkpoint: put a committed buffer onto a checkpoint * list so that we know when it is safe to clean the transaction out of * the log. @@ -797,7 +709,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b33155dd7001..1073259902a6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1141,8 +1141,7 @@ restart_loop: spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; /* Check if the transaction can be dropped now that we are finished */ - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); jbd2_journal_free_transaction(commit_transaction); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 18611241f451..4d1fda1f7143 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) * Otherwise, if the buffer has been written to disk, * it is safe to remove the checkpoint and drop it. */ - if (!buffer_dirty(bh)) { - __jbd2_journal_remove_checkpoint(jh); + if (jbd2_journal_try_remove_checkpoint(jh) >= 0) { spin_unlock(&journal->j_list_lock); goto drop; } @@ -2100,35 +2099,6 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) __brelse(bh); } -/* - * Called from jbd2_journal_try_to_free_buffers(). - * - * Called under jh->b_state_lock - */ -static void -__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) -{ - struct journal_head *jh; - - jh = bh2jh(bh); - - if (buffer_locked(bh) || buffer_dirty(bh)) - goto out; - - if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) - goto out; - - spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL) { - /* written-back checkpointed metadata buffer */ - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __jbd2_journal_remove_checkpoint(jh); - } - spin_unlock(&journal->j_list_lock); -out: - return; -} - /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -2186,7 +2156,13 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) continue; spin_lock(&jh->b_state_lock); - __journal_try_to_free_buffer(journal, bh); + if (!jh->b_transaction && !jh->b_next_transaction) { + spin_lock(&journal->j_list_lock); + /* Remove written-back checkpointed metadata buffer */ + if (jh->b_cp_transaction != NULL) + jbd2_journal_try_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + } spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9b030297aa64..e98ddb2b1cf2 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1535,9 +1535,10 @@ const struct inode_operations jfs_dir_inode_operations = { #endif }; +WRAP_DIR_ITER(jfs_readdir) // FIXME! const struct file_operations jfs_dir_operations = { .read = generic_read_dir, - .iterate = jfs_readdir, + .iterate_shared = shared_jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, .compat_ioctl = compat_ptr_ioctl, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6e61fa3acaf1..3aefbad4cc09 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6341,8 +6341,6 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; - if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) - return status; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8a2321d19194..2c9074ab2315 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -956,10 +956,13 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, last_page = page + (offset + sd->len - 1) / PAGE_SIZE; for (page += offset / PAGE_SIZE; page <= last_page; page++) { /* - * Skip page replacement when extending the contents - * of the current page. + * Skip page replacement when extending the contents of the + * current page. But note that we may get two zero_pages in a + * row from shmem. */ - if (page == *(rqstp->rq_next_page - 1)) + if (page == *(rqstp->rq_next_page - 1) && + offset_in_page(rqstp->rq_res.page_base + + rqstp->rq_res.page_len)) continue; if (unlikely(!svc_rqst_replace_page(rqstp, page))) return -EIO; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index a8ce522ac747..35bc79305318 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1101,9 +1101,17 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) int __nilfs_mark_inode_dirty(struct inode *inode, int flags) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; struct buffer_head *ibh; int err; + /* + * Do not dirty inodes after the log writer has been detached + * and its nilfs_root struct has been freed. + */ + if (unlikely(nilfs_purging(nilfs))) + return 0; + err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warn(inode->i_sb, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c2553024bd25..581691e4be49 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2845,6 +2845,7 @@ void nilfs_detach_log_writer(struct super_block *sb) nilfs_segctor_destroy(nilfs->ns_writer); nilfs->ns_writer = NULL; } + set_nilfs_purging(nilfs); /* Force to free the list of dirty files */ spin_lock(&nilfs->ns_inode_lock); @@ -2857,4 +2858,5 @@ void nilfs_detach_log_writer(struct super_block *sb) up_write(&nilfs->ns_segctor_sem); nilfs_dispose_list(nilfs, &garbage_list, 1); + clear_nilfs_purging(nilfs); } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 47c7dfbb7ea5..cd4ae1b8ae16 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -29,6 +29,7 @@ enum { THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ THE_NILFS_GC_RUNNING, /* gc process is running */ THE_NILFS_SB_DIRTY, /* super block is dirty */ + THE_NILFS_PURGING, /* disposing dirty files for cleanup */ }; /** @@ -208,6 +209,7 @@ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(DISCONTINUED, discontinued) THE_NILFS_FNS(GC_RUNNING, gc_running) THE_NILFS_FNS(SB_DIRTY, sb_dirty) +THE_NILFS_FNS(PURGING, purging) /* * Mount option operations diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 52ccd34b1e79..a026dbd3593f 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -272,7 +272,7 @@ int unregister_nls(struct nls_table * nls) return -EINVAL; } -static struct nls_table *find_nls(char *charset) +static struct nls_table *find_nls(const char *charset) { struct nls_table *nls; spin_lock(&nls_lock); @@ -288,7 +288,7 @@ static struct nls_table *find_nls(char *charset) return nls; } -struct nls_table *load_nls(char *charset) +struct nls_table *load_nls(const char *charset) { return try_then_request_module(find_nls(charset), "nls_%s", charset); } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 518c3a21a556..4596c90e7b7c 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1525,10 +1525,11 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, #endif /* NTFS_RW */ +WRAP_DIR_ITER(ntfs_readdir) // FIXME! const struct file_operations ntfs_dir_ops = { .llseek = generic_file_llseek, /* Seek inside directory. */ .read = generic_read_dir, /* Return -EISDIR. */ - .iterate = ntfs_readdir, /* Read directory contents. */ + .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ #ifdef NTFS_RW .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ #endif /* NTFS_RW */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 91a194596552..bf2c17ea96a0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2793,10 +2793,11 @@ const struct file_operations ocfs2_fops = { .remap_file_range = ocfs2_remap_file_range, }; +WRAP_DIR_ITER(ocfs2_readdir) // FIXME! const struct file_operations ocfs2_dops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2842,7 +2843,7 @@ const struct file_operations ocfs2_fops_no_plocks = { const struct file_operations ocfs2_dops_no_plocks = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/open.c b/fs/open.c index 0c55c8e7f837..e6ead0f19964 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1322,7 +1322,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ - if (flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index ee5c4736480f..de39e067ae65 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -954,10 +954,11 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return 0; } +WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, - .iterate = ovl_iterate, + .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5b069f1a1e44..cc8977498c48 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1460,7 +1460,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; sb->s_flags |= SB_POSIXACL; - sb->s_iflags |= SB_I_SKIP_SYNC; + sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); diff --git a/fs/proc/base.c b/fs/proc/base.c index 05452c3b9872..9df3f4839662 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2817,7 +2817,7 @@ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ - .iterate = proc_##LSM##_attr_dir_iterate, \ + .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 9cb32e1a78a0..23fc24d16b31 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -309,6 +309,8 @@ static void append_kcore_note(char *notes, size_t *i, const char *name, static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { + struct file *file = iocb->ki_filp; + char *buf = file->private_data; loff_t *fpos = &iocb->ki_pos; size_t phdrs_offset, notes_offset, data_offset; size_t page_offline_frozen = 1; @@ -555,10 +557,21 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) case KCORE_VMEMMAP: case KCORE_TEXT: /* - * We use _copy_to_iter() to bypass usermode hardening - * which would otherwise prevent this operation. + * Sadly we must use a bounce buffer here to be able to + * make use of copy_from_kernel_nofault(), as these + * memory regions might not always be mapped on all + * architectures. */ - if (_copy_to_iter((char *)start, tsz, iter) != tsz) { + if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } + /* + * We know the bounce buffer is safe to copy from, so + * use _copy_to_iter() directly. + */ + } else if (_copy_to_iter(buf, tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -595,6 +608,10 @@ static int open_kcore(struct inode *inode, struct file *filp) if (ret) return ret; + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -605,9 +622,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + static const struct proc_ops kcore_proc_ops = { .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, + .proc_release = release_kcore, .proc_lseek = default_llseek, }; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cb80a7703d58..1fb213f379a5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -132,7 +132,7 @@ ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, u64 *ppos, bool encrypted) { unsigned long pfn, offset; - size_t nr_bytes; + ssize_t nr_bytes; ssize_t read = 0, tmp; int idx; diff --git a/fs/readdir.c b/fs/readdir.c index b264ce60114d..c8c46e294431 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -25,6 +25,53 @@ #include <asm/unaligned.h> /* + * Some filesystems were never converted to '->iterate_shared()' + * and their directory iterators want the inode lock held for + * writing. This wrapper allows for converting from the shared + * semantics to the exclusive inode use. + */ +int wrap_directory_iterator(struct file *file, + struct dir_context *ctx, + int (*iter)(struct file *, struct dir_context *)) +{ + struct inode *inode = file_inode(file); + int ret; + + /* + * We'd love to have an 'inode_upgrade_trylock()' operation, + * see the comment in mmap_upgrade_trylock() in mm/memory.c. + * + * But considering this is for "filesystems that never got + * converted", it really doesn't matter. + * + * Also note that since we have to return with the lock held + * for reading, we can't use the "killable()" locking here, + * since we do need to get the lock even if we're dying. + * + * We could do the write part killably and then get the read + * lock unconditionally if it mattered, but see above on why + * this does the very simplistic conversion. + */ + up_read(&inode->i_rwsem); + down_write(&inode->i_rwsem); + + /* + * Since we dropped the inode lock, we should do the + * DEADDIR test again. See 'iterate_dir()' below. + * + * Note that we don't need to re-do the f_pos games, + * since the file must be locked wrt f_pos anyway. + */ + ret = -ENOENT; + if (!IS_DEADDIR(inode)) + ret = iter(file, ctx); + + downgrade_write(&inode->i_rwsem); + return ret; +} +EXPORT_SYMBOL(wrap_directory_iterator); + +/* * Note the "unsafe_put_user() semantics: we goto a * label for errors. */ @@ -40,39 +87,28 @@ int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - bool shared = false; int res = -ENOTDIR; - if (file->f_op->iterate_shared) - shared = true; - else if (!file->f_op->iterate) + + if (!file->f_op->iterate_shared) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; - if (shared) - res = down_read_killable(&inode->i_rwsem); - else - res = down_write_killable(&inode->i_rwsem); + res = down_read_killable(&inode->i_rwsem); if (res) goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; - if (shared) - res = file->f_op->iterate_shared(file, ctx); - else - res = file->f_op->iterate(file, ctx); + res = file->f_op->iterate_shared(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } - if (shared) - inode_unlock_shared(inode); - else - inode_unlock(inode); + inode_unlock_shared(inode); out: return res; } diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index fb4162a52844..aec6e9137474 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -153,6 +153,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) in_flight(server), atomic_read(&server->in_send), atomic_read(&server->num_waiters)); +#ifdef CONFIG_NET_NS + if (server->net) + seq_printf(m, " Net namespace: %u ", server->net->ns.inum); +#endif /* NET_NS */ + } static inline const char *smb_speed_to_str(size_t bps) @@ -430,10 +435,15 @@ skip_rdma: server->reconnect_instance, server->srv_count, server->sec_mode, in_flight(server)); +#ifdef CONFIG_NET_NS + if (server->net) + seq_printf(m, " Net namespace: %u ", server->net->ns.inum); +#endif /* NET_NS */ seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", atomic_read(&server->in_send), atomic_read(&server->num_waiters)); + if (server->leaf_fullpath) { seq_printf(m, "\nDFS leaf full path: %s", server->leaf_fullpath); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index d7274eefc666..15c8cc4b6680 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -159,6 +159,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 43 -#define CIFS_VERSION "2.43" +#define SMB3_PRODUCT_BUILD 44 +#define CIFS_VERSION "2.44" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index b5808fe3469a..657dee4b2c8c 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -532,7 +532,7 @@ struct smb_version_operations { /* Check for STATUS_IO_TIMEOUT */ bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ - void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); + bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); }; struct smb_version_values { @@ -1062,6 +1062,7 @@ struct cifs_ses { unsigned long chans_need_reconnect; /* ========= end: protected by chan_lock ======== */ struct cifs_ses *dfs_root_ses; + struct nls_table *local_nls; }; static inline bool diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 19f7385abeec..25503f1a4fd2 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -129,7 +129,7 @@ again: } spin_unlock(&server->srv_lock); - nls_codepage = load_nls_default(); + nls_codepage = ses->local_nls; /* * need to prevent multiple threads trying to simultaneously @@ -200,7 +200,6 @@ out: rc = -EAGAIN; } - unload_nls(nls_codepage); return rc; } @@ -3184,7 +3183,7 @@ setAclRetry: param_offset = offsetof(struct smb_com_transaction2_spi_req, InformationLevel) - 4; offset = param_offset + params; - parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + parm_data = ((char *)pSMB) + sizeof(pSMB->hdr.smb_buf_length) + offset; pSMB->ParameterOffset = cpu_to_le16(param_offset); /* convert to on the wire format for POSIX ACL */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 85dd1b373974..238538dde4e3 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -60,7 +60,7 @@ extern bool disable_legacy_dialects; #define TLINK_IDLE_EXPIRE (600 * HZ) /* Drop the connection to not overload the server */ -#define NUM_STATUS_IO_TIMEOUT 5 +#define MAX_STATUS_IO_TIMEOUT 5 static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); @@ -1117,6 +1117,7 @@ cifs_demultiplex_thread(void *p) struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; unsigned int noreclaim_flag, num_io_timeout = 0; + bool pending_reconnect = false; noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); @@ -1156,6 +1157,8 @@ cifs_demultiplex_thread(void *p) cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; + + pending_reconnect = false; next_pdu: server->pdu_size = pdu_length; @@ -1213,10 +1216,13 @@ next_pdu: if (server->ops->is_status_io_timeout && server->ops->is_status_io_timeout(buf)) { num_io_timeout++; - if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { - cifs_reconnect(server, false); + if (num_io_timeout > MAX_STATUS_IO_TIMEOUT) { + cifs_server_dbg(VFS, + "Number of request timeouts exceeded %d. Reconnecting", + MAX_STATUS_IO_TIMEOUT); + + pending_reconnect = true; num_io_timeout = 0; - continue; } } @@ -1226,9 +1232,14 @@ next_pdu: if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; - if (bufs[i] && server->ops->is_network_name_deleted) - server->ops->is_network_name_deleted(bufs[i], - server); + if (bufs[i] != NULL) { + if (server->ops->is_network_name_deleted && + server->ops->is_network_name_deleted(bufs[i], + server)) { + cifs_server_dbg(FYI, + "Share deleted. Reconnect needed"); + } + } if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1263,6 +1274,11 @@ next_pdu: buf = server->smallbuf; goto next_pdu; } + + /* do this reconnect at the very end after processing all MIDs */ + if (pending_reconnect) + cifs_reconnect(server, true); + } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ @@ -1826,6 +1842,10 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx) CIFS_MAX_PASSWORD_LEN)) return 0; } + + if (strcmp(ctx->local_nls->charset, ses->local_nls->charset)) + return 0; + return 1; } @@ -2270,6 +2290,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses->sectype = ctx->sectype; ses->sign = ctx->sign; + ses->local_nls = load_nls(ctx->local_nls->charset); /* add server as first channel */ spin_lock(&ses->chan_lock); diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 1403a2d1ab17..ee772c3d9f00 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -66,6 +66,12 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) return rc; } +/* + * Track individual DFS referral servers used by new DFS mount. + * + * On success, their lifetime will be shared by final tcon (dfs_ses_list). + * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount(). + */ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; @@ -80,11 +86,12 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) INIT_LIST_HEAD(&root_ses->list); spin_lock(&cifs_tcp_ses_lock); - ses->ses_count++; + cifs_smb_ses_inc_refcount(ses); spin_unlock(&cifs_tcp_ses_lock); root_ses->ses = ses; list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list); } + /* Select new DFS referral server so that new referrals go through it */ ctx->dfs_root_ses = ses; return 0; } @@ -170,8 +177,12 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl); - if (rc) + if (rc) { + rc = cifs_mount_get_tcon(mnt_ctx); + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); break; + } tit = dfs_cache_get_tgt_iterator(&tl); if (!tit) { @@ -242,7 +253,6 @@ out: int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct cifs_ses *ses; bool nodfs = ctx->nodfs; int rc; @@ -276,20 +286,8 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) } *isdfs = true; - /* - * Prevent DFS root session of being put in the first call to - * cifs_mount_put_conns(). If another DFS root server was not found - * while chasing the referrals (@ctx->dfs_root_ses == @ses), then we - * can safely put extra refcount of @ses. - */ - ses = mnt_ctx->ses; - mnt_ctx->ses = NULL; - mnt_ctx->server = NULL; - rc = __dfs_mount_share(mnt_ctx); - if (ses == ctx->dfs_root_ses) - cifs_put_smb_ses(ses); - - return rc; + add_root_smb_session(mnt_ctx); + return __dfs_mount_share(mnt_ctx); } /* Update dfs referral path of superblock */ diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 879bc8e6555c..6bc44f79d2e9 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1080,8 +1080,8 @@ int cifs_close(struct inode *inode, struct file *file) cfile = file->private_data; file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); - if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && - cinode->lease_granted && + if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG) + && cinode->lease_granted && !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { @@ -4681,9 +4681,9 @@ static int cifs_readpage_worker(struct file *file, struct page *page, io_error: kunmap(page); - unlock_page(page); read_complete: + unlock_page(page); return rc; } @@ -4878,9 +4878,11 @@ void cifs_oplock_break(struct work_struct *work) struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); struct inode *inode = d_inode(cfile->dentry); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cinode = CIFS_I(inode); - struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct tcon_link *tlink; int rc = 0; bool purge_cache = false, oplock_break_cancelled; __u64 persistent_fid, volatile_fid; @@ -4889,6 +4891,12 @@ void cifs_oplock_break(struct work_struct *work) wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + goto out; + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + server->ops->downgrade_oplock(server, cinode, cfile->oplock_level, cfile->oplock_epoch, &purge_cache); @@ -4938,18 +4946,19 @@ oplock_break_ack: /* * MS-SMB2 3.2.5.19.1 and 3.2.5.19.2 (and MS-CIFS 3.2.5.42) do not require * an acknowledgment to be sent when the file has already been closed. - * check for server null, since can race with kill_sb calling tree disconnect. */ spin_lock(&cinode->open_file_lock); - if (tcon->ses && tcon->ses->server && !oplock_break_cancelled && - !list_empty(&cinode->openFileList)) { + /* check list empty since can race with kill_sb calling tree disconnect */ + if (!oplock_break_cancelled && !list_empty(&cinode->openFileList)) { spin_unlock(&cinode->open_file_lock); - rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, - volatile_fid, net_fid, cinode); + rc = server->ops->oplock_response(tcon, persistent_fid, + volatile_fid, net_fid, cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } else spin_unlock(&cinode->open_file_lock); + cifs_put_tlink(tlink); +out: cifs_done_oplock_break(cinode); } diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 4946a0c59600..67e16c2ac90e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -231,6 +231,8 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c break; case Opt_sec_none: ctx->nullauth = 1; + kfree(ctx->username); + ctx->username = NULL; break; default: cifs_errorf(fc, "bad security option: %s\n", value); @@ -1201,6 +1203,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_user: kfree(ctx->username); ctx->username = NULL; + if (ctx->nullauth) + break; if (strlen(param->string) == 0) { /* null user, ie. anonymous authentication */ ctx->nullauth = 1; diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index fff092bbc7a3..f7160003e0ed 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -433,16 +433,21 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) * Dump encryption keys. This is an old ioctl that only * handles AES-128-{CCM,GCM}. */ - if (pSMBFile == NULL) - break; if (!capable(CAP_SYS_ADMIN)) { rc = -EACCES; break; } - tcon = tlink_tcon(pSMBFile->tlink); + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); if (!smb3_encryption_required(tcon)) { rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); break; } pkey_inf.cipher_type = @@ -459,6 +464,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EFAULT; else rc = 0; + cifs_put_tlink(tlink); break; case CIFS_DUMP_FULL_KEY: /* @@ -470,8 +476,16 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EACCES; break; } - tcon = tlink_tcon(pSMBFile->tlink); + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + + tcon = tlink_tcon(tlink); rc = cifs_dump_full_key(tcon, (void __user *)arg); + cifs_put_tlink(tlink); break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 70dbfe6584f9..d7e85d9a2655 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -95,6 +95,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) return; } + unload_nls(buf_to_free->local_nls); atomic_dec(&sesInfoAllocCount); kfree(buf_to_free->serverOS); kfree(buf_to_free->serverDomain); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 335c078c42fb..c57ca2050b73 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -1013,6 +1013,7 @@ setup_ntlm_smb3_neg_ret: } +/* See MS-NLMP 2.2.1.3 */ int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, @@ -1047,7 +1048,8 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; - + /* we only send version information in ntlmssp negotiate, so do not set this flag */ + flags = flags & ~NTLMSSP_NEGOTIATE_VERSION; tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87abce010974..0f62bc373ad0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2395,7 +2395,7 @@ smb2_is_status_io_timeout(char *buf) return false; } -static void +static bool smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; @@ -2404,7 +2404,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) struct cifs_tcon *tcon; if (shdr->Status != STATUS_NETWORK_NAME_DELETED) - return; + return false; /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; @@ -2419,11 +2419,13 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", tcon->tree_name); - return; + return true; } } } spin_unlock(&cifs_tcp_ses_lock); + + return false; } static int diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index e04766fe6f80..a457f07f820d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -242,7 +242,7 @@ again: } spin_unlock(&server->srv_lock); - nls_codepage = load_nls_default(); + nls_codepage = ses->local_nls; /* * need to prevent multiple threads trying to simultaneously @@ -324,7 +324,6 @@ out: rc = -EAGAIN; } failed: - unload_nls(nls_codepage); return rc; } diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index c6db898dab7c..7676091b3e77 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -160,7 +160,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) spin_unlock(&ses->ses_lock); continue; } - ++ses->ses_count; + cifs_smb_ses_inc_refcount(ses); spin_unlock(&ses->ses_lock); return ses; } diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index fb8b2d566efb..b7521e41402e 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -352,7 +352,8 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_SHARE_FLAG_STREAMS BIT(11) #define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12) #define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13) -#define KSMBD_SHARE_FLAG_UPDATE BIT(14) +#define KSMBD_SHARE_FLAG_UPDATE BIT(14) +#define KSMBD_SHARE_FLAG_CROSSMNT BIT(15) /* * Tree connect request flags. diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index ced7a9e916f0..9df121bdf349 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -286,6 +286,7 @@ static void handle_ksmbd_work(struct work_struct *wk) static int queue_ksmbd_work(struct ksmbd_conn *conn) { struct ksmbd_work *work; + int err; work = ksmbd_alloc_work_struct(); if (!work) { @@ -297,7 +298,11 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) work->request_buf = conn->request_buf; conn->request_buf = NULL; - ksmbd_init_smb_server(work); + err = ksmbd_init_smb_server(work); + if (err) { + ksmbd_free_work_struct(work); + return 0; + } ksmbd_conn_enqueue_request(work); atomic_inc(&conn->r_count); diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 33b7e6c4ceff..e881df1d10cb 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -380,13 +380,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { - if (command == SMB2_OPLOCK_BREAK_HE && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + if (!(command == SMB2_OPLOCK_BREAK_HE && + (le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_20 || + le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_21))) { /* special case for SMB2.1 lease break message */ ksmbd_debug(SMB, - "Illegal request size %d for oplock break\n", - le16_to_cpu(pdu->StructureSize2)); + "Illegal request size %u for command %d\n", + le16_to_cpu(pdu->StructureSize2), command); return 1; } } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cf8822103f50..7cc1b0c47d0a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -87,9 +87,9 @@ struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn */ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) { - struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf); + struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); unsigned int cmd = le16_to_cpu(req_hdr->Command); - int tree_id; + unsigned int tree_id; if (cmd == SMB2_TREE_CONNECT_HE || cmd == SMB2_CANCEL_HE || @@ -114,7 +114,7 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) pr_err("The first operation in the compound does not have tcon\n"); return -EINVAL; } - if (work->tcon->id != tree_id) { + if (tree_id != UINT_MAX && work->tcon->id != tree_id) { pr_err("tree id(%u) is different with id(%u) in first operation\n", tree_id, work->tcon->id); return -EINVAL; @@ -559,9 +559,9 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) */ int smb2_check_user_session(struct ksmbd_work *work) { - struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf); + struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned int cmd = conn->ops->get_cmd_val(work); + unsigned int cmd = le16_to_cpu(req_hdr->Command); unsigned long long sess_id; /* @@ -587,7 +587,7 @@ int smb2_check_user_session(struct ksmbd_work *work) pr_err("The first operation in the compound does not have sess\n"); return -EINVAL; } - if (work->sess->id != sess_id) { + if (sess_id != ULLONG_MAX && work->sess->id != sess_id) { pr_err("session id(%llu) is different with the first operation(%lld)\n", sess_id, work->sess->id); return -EINVAL; @@ -2324,9 +2324,16 @@ next: break; buf_len -= next; eabuf = (struct smb2_ea_info *)((char *)eabuf + next); - if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength)) + if (buf_len < sizeof(struct smb2_ea_info)) { + rc = -EINVAL; break; + } + if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + + le16_to_cpu(eabuf->EaValueLength)) { + rc = -EINVAL; + break; + } } while (next != 0); kfree(attr_name); @@ -2467,8 +2474,9 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, } } -static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, - int open_flags, umode_t posix_mode, bool is_dir) +static int smb2_creat(struct ksmbd_work *work, struct path *parent_path, + struct path *path, char *name, int open_flags, + umode_t posix_mode, bool is_dir) { struct ksmbd_tree_connect *tcon = work->tcon; struct ksmbd_share_config *share = tcon->share_conf; @@ -2495,7 +2503,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, return rc; } - rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0); + rc = ksmbd_vfs_kern_path_locked(work, name, 0, parent_path, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2565,7 +2573,7 @@ int smb2_open(struct ksmbd_work *work) struct ksmbd_tree_connect *tcon = work->tcon; struct smb2_create_req *req; struct smb2_create_rsp *rsp; - struct path path; + struct path path, parent_path; struct ksmbd_share_config *share = tcon->share_conf; struct ksmbd_file *fp = NULL; struct file *filp = NULL; @@ -2786,7 +2794,8 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1); + rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, + &parent_path, &path, 1); if (!rc) { file_present = true; @@ -2906,7 +2915,8 @@ int smb2_open(struct ksmbd_work *work) /*create file if not present */ if (!file_present) { - rc = smb2_creat(work, &path, name, open_flags, posix_mode, + rc = smb2_creat(work, &parent_path, &path, name, open_flags, + posix_mode, req->CreateOptions & FILE_DIRECTORY_FILE_LE); if (rc) { if (rc == -ENOENT) { @@ -3321,8 +3331,9 @@ int smb2_open(struct ksmbd_work *work) err_out: if (file_present || created) { - inode_unlock(d_inode(path.dentry->d_parent)); - dput(path.dentry); + inode_unlock(d_inode(parent_path.dentry)); + path_put(&path); + path_put(&parent_path); } ksmbd_revert_fsids(work); err_out1: @@ -5545,7 +5556,7 @@ static int smb2_create_link(struct ksmbd_work *work, struct nls_table *local_nls) { char *link_name = NULL, *target_name = NULL, *pathname = NULL; - struct path path; + struct path path, parent_path; bool file_present = false; int rc; @@ -5575,7 +5586,7 @@ static int smb2_create_link(struct ksmbd_work *work, ksmbd_debug(SMB, "target name is %s\n", target_name); rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, - &path, 0); + &parent_path, &path, 0); if (rc) { if (rc != -ENOENT) goto out; @@ -5605,8 +5616,9 @@ static int smb2_create_link(struct ksmbd_work *work, rc = -EINVAL; out: if (file_present) { - inode_unlock(d_inode(path.dentry->d_parent)); + inode_unlock(d_inode(parent_path.dentry)); path_put(&path); + path_put(&parent_path); } if (!IS_ERR(link_name)) kfree(link_name); @@ -6209,6 +6221,11 @@ int smb2_read(struct ksmbd_work *work) unsigned int max_read_size = conn->vals->max_read_size; WORK_BUFFERS(work, req, rsp); + if (work->next_smb2_rcv_hdr_off) { + work->send_no_response = 1; + err = -EOPNOTSUPP; + goto out; + } if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { @@ -8609,7 +8626,8 @@ int smb3_decrypt_req(struct ksmbd_work *work) struct smb2_transform_hdr *tr_hdr = smb2_get_msg(buf); int rc = 0; - if (buf_data_size < sizeof(struct smb2_hdr)) { + if (pdu_length < sizeof(struct smb2_transform_hdr) || + buf_data_size < sizeof(struct smb2_hdr)) { pr_err("Transform message is too small (%u)\n", pdu_length); return -ECONNABORTED; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index ef20f63e55e6..c2b75d898852 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -388,26 +388,29 @@ static struct smb_version_cmds smb1_server_cmds[1] = { [SMB_COM_NEGOTIATE_EX] = { .proc = smb1_negotiate, }, }; -static void init_smb1_server(struct ksmbd_conn *conn) +static int init_smb1_server(struct ksmbd_conn *conn) { conn->ops = &smb1_server_ops; conn->cmds = smb1_server_cmds; conn->max_cmds = ARRAY_SIZE(smb1_server_cmds); + return 0; } -void ksmbd_init_smb_server(struct ksmbd_work *work) +int ksmbd_init_smb_server(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; __le32 proto; - if (conn->need_neg == false) - return; - proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol; + if (conn->need_neg == false) { + if (proto == SMB1_PROTO_NUMBER) + return -EINVAL; + return 0; + } + if (proto == SMB1_PROTO_NUMBER) - init_smb1_server(conn); - else - init_smb3_11_server(conn); + return init_smb1_server(conn); + return init_smb3_11_server(conn); } int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index aeca0f46068f..f1092519c0c2 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn); int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); -void ksmbd_init_smb_server(struct ksmbd_work *work); +int ksmbd_init_smb_server(struct ksmbd_work *work); struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index e35914457350..3d5d652153a5 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -63,13 +63,13 @@ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, char *pathname, unsigned int flags, + struct path *parent_path, struct path *path) { struct qstr last; struct filename *filename; struct path *root_share_path = &share_conf->vfs_path; int err, type; - struct path parent_path; struct dentry *d; if (pathname[0] == '\0') { @@ -84,7 +84,7 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, return PTR_ERR(filename); err = vfs_path_parent_lookup(filename, flags, - &parent_path, &last, &type, + parent_path, &last, &type, root_share_path); if (err) { putname(filename); @@ -92,13 +92,13 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, } if (unlikely(type != LAST_NORM)) { - path_put(&parent_path); + path_put(parent_path); putname(filename); return -ENOENT; } - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path->dentry, 0); if (IS_ERR(d)) goto err_out; @@ -108,15 +108,22 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, } path->dentry = d; - path->mnt = share_conf->vfs_path.mnt; - path_put(&parent_path); - putname(filename); + path->mnt = mntget(parent_path->mnt); + if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_CROSSMNT)) { + err = follow_down(path, 0); + if (err < 0) { + path_put(path); + goto err_out; + } + } + + putname(filename); return 0; err_out: - inode_unlock(parent_path.dentry->d_inode); - path_put(&parent_path); + inode_unlock(d_inode(parent_path->dentry)); + path_put(parent_path); putname(filename); return -ENOENT; } @@ -412,7 +419,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, { char *stream_buf = NULL, *wbuf; struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); - size_t size, v_len; + size_t size; + ssize_t v_len; int err = 0; ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", @@ -429,9 +437,9 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, fp->stream.name, fp->stream.size, &stream_buf); - if ((int)v_len < 0) { + if (v_len < 0) { pr_err("not found stream in xattr : %zd\n", v_len); - err = (int)v_len; + err = v_len; goto out; } @@ -1194,14 +1202,14 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, * Return: 0 on success, otherwise error */ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *path, - bool caseless) + unsigned int flags, struct path *parent_path, + struct path *path, bool caseless) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; int err; - struct path parent_path; - err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path); + err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, parent_path, + path); if (!err) return 0; @@ -1216,10 +1224,10 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, path_len = strlen(filepath); remain_len = path_len; - parent_path = share_conf->vfs_path; - path_get(&parent_path); + *parent_path = share_conf->vfs_path; + path_get(parent_path); - while (d_can_lookup(parent_path.dentry)) { + while (d_can_lookup(parent_path->dentry)) { char *filename = filepath + path_len - remain_len; char *next = strchrnul(filename, '/'); size_t filename_len = next - filename; @@ -1228,7 +1236,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, if (filename_len == 0) break; - err = ksmbd_vfs_lookup_in_dir(&parent_path, filename, + err = ksmbd_vfs_lookup_in_dir(parent_path, filename, filename_len, work->conn->um); if (err) @@ -1245,8 +1253,8 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, goto out2; else if (is_last) goto out1; - path_put(&parent_path); - parent_path = *path; + path_put(parent_path); + *parent_path = *path; next[0] = '/'; remain_len -= filename_len + 1; @@ -1254,16 +1262,17 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, err = -EINVAL; out2: - path_put(&parent_path); + path_put(parent_path); out1: kfree(filepath); } if (!err) { - err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry); - if (err) - dput(path->dentry); - path_put(&parent_path); + err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry); + if (err) { + path_put(path); + path_put(parent_path); + } } return err; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 80039312c255..72f9fb4b48d1 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -115,8 +115,8 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, const struct path *path, char *attr_name); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *path, - bool caseless); + unsigned int flags, struct path *parent_path, + struct path *path, bool caseless); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, diff --git a/fs/splice.c b/fs/splice.c index 004eb1c4ce31..3e2a31e1ce6a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -876,6 +876,8 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= MSG_MORE; + if (out->f_flags & O_NONBLOCK) + msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 075f15c43c78..5f1a14d5b927 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -179,9 +179,10 @@ static int vboxsf_dir_iterate(struct file *dir, struct dir_context *ctx) return 0; } +WRAP_DIR_ITER(vboxsf_dir_iterate) // FIXME! const struct file_operations vboxsf_dir_fops = { .open = vboxsf_dir_open, - .iterate = vboxsf_dir_iterate, + .iterate_shared = shared_vboxsf_dir_iterate, .release = vboxsf_dir_release, .read = generic_read_dir, .llseek = generic_file_llseek, diff --git a/fs/vboxsf/shfl_hostintf.h b/fs/vboxsf/shfl_hostintf.h index aca829062c12..069a019c9247 100644 --- a/fs/vboxsf/shfl_hostintf.h +++ b/fs/vboxsf/shfl_hostintf.h @@ -68,9 +68,9 @@ struct shfl_string { /** UTF-8 or UTF-16 string. Nul terminated. */ union { - u8 utf8[2]; - u16 utf16[1]; - u16 ucs2[1]; /* misnomer, use utf16. */ + u8 legacy_padding[2]; + DECLARE_FLEX_ARRAY(u8, utf8); + DECLARE_FLEX_ARRAY(u16, utf16); } string; }; VMMDEV_ASSERT_SIZE(shfl_string, 6); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 25e2841084e1..f9015f88eca7 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -591,7 +591,7 @@ struct xfs_attr_shortform { uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ uint8_t nameval[]; /* name & value bytes concatenated */ - } list[1]; /* variable sized array */ + } list[]; /* variable sized array */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ @@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ typedef struct xfs_attr_leaf_name_local { __be16 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ + /* + * In Linux 6.5 this flex array was converted from nameval[1] to + * nameval[]. Be very careful here about extra padding at the end; + * see xfs_attr_leaf_entsize_local() for details. + */ + __u8 nameval[]; /* name/value bytes */ } xfs_attr_leaf_name_local_t; typedef struct xfs_attr_leaf_name_remote { __be32 valueblk; /* block number of value bytes */ __be32 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ + /* + * In Linux 6.5 this flex array was converted from name[1] to name[]. + * Be very careful here about extra padding at the end; see + * xfs_attr_leaf_entsize_remote() for details. + */ + __u8 name[]; /* name bytes */ } xfs_attr_leaf_name_remote_t; typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */ /* * The rest of the block contains the following structures after the * leaf entries, growing from the bottom up. The variables are never @@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr { struct xfs_attr3_leafblock { struct xfs_attr3_leaf_hdr hdr; - struct xfs_attr_leaf_entry entries[1]; + struct xfs_attr_leaf_entry entries[]; /* * The rest of the block contains the following structures after the @@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + - nlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with + * name[1], which was used as a flexarray. The layout of this struct + * is 9 bytes of fixed-length fields followed by a __u8 flex array at + * offset 9. + * + * On most architectures, struct xfs_attr_leaf_name_remote had two + * bytes of implicit padding at the end of the struct to make the + * struct length 12. After converting name[1] to name[], there are + * three implicit padding bytes and the struct size remains 12. + * However, there are compiler configurations that do not add implicit + * padding at all (m68k) and have been broken for years. + * + * This entsize computation historically added (the xattr name length) + * to (the padded struct length - 1) and rounded that sum up to the + * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4). + * This is encoded in the ondisk format, so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t remotesize = + offsetof(struct xfs_attr_leaf_name_remote, name) + 2; + + return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + - nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with + * nameval[1], which was used as a flexarray. The layout of this + * struct is 3 bytes of fixed-length fields followed by a __u8 flex + * array at offset 3. + * + * struct xfs_attr_leaf_name_local had zero bytes of implicit padding + * at the end of the struct to make the struct length 4. On most + * architectures, after converting nameval[1] to nameval[], there is + * one implicit padding byte and the struct size remains 4. However, + * there are compiler configurations that do not add implicit padding + * at all (m68k) and would break. + * + * This entsize computation historically added (the xattr name and + * value length) to (the padded struct length - 1) and rounded that sum + * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is + * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format, + * so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t localsize = + offsetof(struct xfs_attr_leaf_name_local, nameval); + + return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 9c60ebb328b4..2cbf9ea39b8c 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor { struct xfs_attrlist { __s32 al_count; /* number of entries in attrlist */ __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ + __s32 al_offset[]; /* byte offsets of attrs [var-sized] */ }; struct xfs_attrlist_ent { /* data from attr_list() */ __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ + char a_name[]; /* attr name (NULL terminated) */ }; typedef struct xfs_fsop_attrlist_handlereq { diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 9737b5a9f405..c4cc99b70dd3 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void) /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80); XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64); @@ -88,7 +88,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 92c9aaae3663..789cfb74c146 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -341,77 +341,6 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek_size(file, offset, whence, isize, isize); } -struct zonefs_zone_append_bio { - /* The target inode of the BIO */ - struct inode *inode; - - /* For sync writes, the target append write offset */ - u64 append_offset; - - /* - * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire zonefs_bio but relies on bio being last. - */ - struct bio bio; -}; - -static inline struct zonefs_zone_append_bio * -zonefs_zone_append_bio(struct bio *bio) -{ - return container_of(bio, struct zonefs_zone_append_bio, bio); -} - -static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio) -{ - struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio); - struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode); - sector_t za_sector; - - if (bio->bi_status != BLK_STS_OK) - goto bio_end; - - /* - * If the file zone was written underneath the file system, the zone - * append operation can still succedd (if the zone is not full) but - * the write append location will not be where we expect it to be. - * Check that we wrote where we intended to, that is, at z->z_wpoffset. - */ - za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT); - if (bio->bi_iter.bi_sector != za_sector) { - zonefs_warn(za_bio->inode->i_sb, - "Invalid write sector %llu for zone at %llu\n", - bio->bi_iter.bi_sector, z->z_sector); - bio->bi_status = BLK_STS_IOERR; - } - -bio_end: - iomap_dio_bio_end_io(bio); -} - -static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter, - struct bio *bio, - loff_t file_offset) -{ - struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio); - struct inode *inode = iter->inode; - struct zonefs_zone *z = zonefs_inode_zone(inode); - - /* - * Issue a zone append BIO to process sync dio writes. The append - * file offset is saved to check the zone append write location - * on completion of the BIO. - */ - za_bio->inode = inode; - za_bio->append_offset = file_offset; - - bio->bi_opf &= ~REQ_OP_WRITE; - bio->bi_opf |= REQ_OP_ZONE_APPEND; - bio->bi_iter.bi_sector = z->z_sector; - bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io; - - submit_bio(bio); -} - static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { @@ -442,14 +371,6 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, return 0; } -static struct bio_set zonefs_zone_append_bio_set; - -static const struct iomap_dio_ops zonefs_zone_append_dio_ops = { - .submit_io = zonefs_file_zone_append_dio_submit_io, - .end_io = zonefs_file_write_dio_end_io, - .bio_set = &zonefs_zone_append_bio_set, -}; - static const struct iomap_dio_ops zonefs_write_dio_ops = { .end_io = zonefs_file_write_dio_end_io, }; @@ -533,9 +454,6 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) struct zonefs_inode_info *zi = ZONEFS_I(inode); struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; - const struct iomap_dio_ops *dio_ops; - bool sync = is_sync_kiocb(iocb); - bool append = false; ssize_t ret, count; /* @@ -543,7 +461,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned). */ - if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) && + (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -573,18 +492,6 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) goto inode_unlock; } mutex_unlock(&zi->i_truncate_mutex); - append = sync; - } - - if (append) { - unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev); - - max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize); - iov_iter_truncate(from, max); - - dio_ops = &zonefs_zone_append_dio_ops; - } else { - dio_ops = &zonefs_write_dio_ops; } /* @@ -593,7 +500,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) * the user can make sense of the error. */ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - dio_ops, 0, NULL, 0); + &zonefs_write_dio_ops, 0, NULL, 0); if (ret == -ENOTBLK) ret = -EBUSY; @@ -938,15 +845,3 @@ const struct file_operations zonefs_file_operations = { .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, }; - -int zonefs_file_bioset_init(void) -{ - return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE, - offsetof(struct zonefs_zone_append_bio, bio), - BIOSET_NEED_BVECS); -} - -void zonefs_file_bioset_exit(void) -{ - bioset_exit(&zonefs_zone_append_bio_set); -} diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bbe44a26a8e5..9350221abfc5 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1412,13 +1412,9 @@ static int __init zonefs_init(void) BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); - ret = zonefs_file_bioset_init(); - if (ret) - return ret; - ret = zonefs_init_inodecache(); if (ret) - goto destroy_bioset; + return ret; ret = zonefs_sysfs_init(); if (ret) @@ -1434,8 +1430,6 @@ sysfs_exit: zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); -destroy_bioset: - zonefs_file_bioset_exit(); return ret; } @@ -1445,7 +1439,6 @@ static void __exit zonefs_exit(void) unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); - zonefs_file_bioset_exit(); } MODULE_AUTHOR("Damien Le Moal"); diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index f663b8ebc2cb..8175652241b5 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -279,8 +279,6 @@ extern const struct file_operations zonefs_dir_operations; extern const struct address_space_operations zonefs_file_aops; extern const struct file_operations zonefs_file_operations; int zonefs_file_truncate(struct inode *inode, loff_t isize); -int zonefs_file_bioset_init(void); -void zonefs_file_bioset_exit(void); /* In sysfs.c */ int zonefs_sysfs_register(struct super_block *sb); |