diff options
Diffstat (limited to 'fs/ocfs2')
64 files changed, 2219 insertions, 1715 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 5d11380d8724..2514d36cbe01 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,11 +2,13 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA select QUOTA_TREE select FS_POSIX_ACL + select LEGACY_DIRECT_IO help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -16,9 +18,9 @@ config OCFS2_FS You'll want to install the ocfs2-tools package in order to at least get "mount.ocfs2". - Project web page: https://oss.oracle.com/projects/ocfs2 - Tools web page: https://oss.oracle.com/projects/ocfs2-tools - OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/ + Project web page: https://ocfs2.wiki.kernel.org/ + Tools web page: https://github.com/markfasheh/ocfs2-tools + OCFS2 mailing lists: https://subspace.kernel.org/lists.linux.dev.html For more information on OCFS2, see the file <file:Documentation/filesystems/ocfs2.rst>. 
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9f19cf9a5a9f..af1e2cedb217 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/fs_struct.h> #include <cluster/masklog.h> @@ -191,10 +192,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); @@ -260,7 +261,7 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; @@ -274,7 +275,7 @@ int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; - status = posix_acl_update_mode(&init_user_ns, inode, &mode, + status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (status) goto unlock; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index a897c4e41b26..667c6f03fa60 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, 
struct inode *, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 51c93929a146..b267ec580da9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -566,7 +566,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec); /* - * Reset the actual path elements so that we can re-use the structure + * Reset the actual path elements so that we can reuse the structure * to build another path. Generally, this involves freeing the buffer * heads. */ @@ -967,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) el = &eb->h_list; } - BUG_ON(el->l_tree_depth != 0); + if (el->l_tree_depth != 0) { + retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)last_eb_blk, + le16_to_cpu(el->l_tree_depth)); + goto bail; + } retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: @@ -1175,7 +1182,7 @@ static int ocfs2_add_branch(handle_t *handle, /* * If there is a gap before the root end and the real end - * of the righmost leaf block, we need to remove the gap + * of the rightmost leaf block, we need to remove the gap * between new_cpos and root_end first so that the tree * is consistent after we add a new branch(it will start * from new_cpos). @@ -1231,7 +1238,7 @@ static int ocfs2_add_branch(handle_t *handle, /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. - * conversly, new_eb_bhs[0] is the new bottommost leaf. + * conversely, new_eb_bhs[0] is the new bottommost leaf. 
* * when we leave the loop, new_last_eb_blk will point to the * newest leaf, and next_blkno will point to the topmost extent @@ -1796,6 +1803,14 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, el = root_el; while (el->l_tree_depth) { + if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has invalid tree depth %u in extent list\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + } if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), "Owner %llu has empty extent list at depth %u\n", @@ -3705,7 +3720,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, * update split_index here. * * When the split_index is zero, we need to merge it to the - * prevoius extent block. It is more efficient and easier + * previous extent block. It is more efficient and easier * if we do merge_right first and merge_left later. */ ret = ocfs2_merge_rec_right(path, handle, et, split_rec, @@ -4510,7 +4525,7 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, } /* - * This should only be called against the righmost leaf extent list. + * This should only be called against the rightmost leaf extent list. * * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. @@ -4760,7 +4775,7 @@ bail: } /* - * Allcate and add clusters into the extent b-tree. + * Allocate and add clusters into the extent b-tree. * The new clusters(clusters_to_add) will be inserted at logical_offset. * The extent b-tree's root is specified by et, and * it is not limited to the file storage. 
Any extent tree can use this @@ -6147,6 +6162,9 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int status; struct inode *inode = NULL; struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + unsigned int tl_count; inode = ocfs2_get_system_file_inode(osb, TRUNCATE_LOG_SYSTEM_INODE, @@ -6164,6 +6182,18 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } + di = (struct ocfs2_dinode *)bh->b_data; + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0)) { + status = -EFSCORRUPTED; + iput(inode); + brelse(bh); + mlog_errno(status); + goto bail; + } + *tl_inode = inode; *tl_bh = bh; bail: @@ -6801,27 +6831,27 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys) { int ret, partial = 0; - loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; - ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0); if (ret) mlog_errno(ret); if (zero) - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); /* * Need to set the buffers we zero'd into uptodate * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - ret = walk_page_buffers(handle, page_buffers(page), + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_zero_func); if (ret < 0) @@ -6834,92 +6864,88 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, } if (!partial) - SetPageUptodate(page); + 
folio_mark_uptodate(folio); - flush_dcache_page(page); + flush_dcache_folio(folio); } -static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, - loff_t end, struct page **pages, - int numpages, u64 phys, handle_t *handle) +static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, + loff_t end, struct folio **folios, int numfolios, + u64 phys, handle_t *handle) { int i; - struct page *page; - unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); - if (numpages == 0) + if (numfolios == 0) goto out; - to = PAGE_SIZE; - for(i = 0; i < numpages; i++) { - page = pages[i]; + for (i = 0; i < numfolios; i++) { + struct folio *folio = folios[i]; + size_t to = folio_size(folio); + size_t from = offset_in_folio(folio, start); - from = start & (PAGE_SIZE - 1); - if ((end >> PAGE_SHIFT) == page->index) - to = end & (PAGE_SIZE - 1); + if (to > end - folio_pos(folio)) + to = end - folio_pos(folio); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, + &phys); - ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, - &phys); - - start = (page->index + 1) << PAGE_SHIFT; + start = folio_next_pos(folio); } out: - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); + if (folios) + ocfs2_unlock_and_free_folios(folios, numfolios); } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { - int numpages, ret = 0; + int numfolios, ret = 0; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - numpages = 0; + numfolios = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_SHIFT; do { - pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); - if (!pages[numpages]) { - ret = -ENOMEM; + 
folios[numfolios] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folios[numfolios])) { + ret = PTR_ERR(folios[numfolios]); mlog_errno(ret); + folios[numfolios] = NULL; goto out; } - numpages++; - index++; + index = folio_next_index(folios[numfolios]); + numfolios++; } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); - numpages = 0; + ocfs2_unlock_and_free_folios(folios, numfolios); + numfolios = 0; } - *num = numpages; + *num = numfolios; return ret; } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { struct super_block *sb = inode->i_sb; BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - return ocfs2_grab_pages(inode, start, end, pages, num); + return ocfs2_grab_folios(inode, start, end, folios, num); } /* @@ -6927,14 +6953,14 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because - * otherwise block_write_full_page() will skip writeout of pages past + * otherwise block_write_full_folio() will skip writeout of pages past * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) { - int ret = 0, numpages; - struct page **pages = NULL; + int ret = 0, numfolios; + struct folio **folios = NULL; u64 phys; unsigned int ext_flags; struct super_block *sb = inode->i_sb; @@ -6947,17 +6973,17 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, return 0; /* - * Avoid zeroing pages fully beyond current i_size. 
It is pointless as - * underlying blocks of those pages should be already zeroed out and + * Avoid zeroing folios fully beyond current i_size. It is pointless as + * underlying blocks of those folios should be already zeroed out and * page writeback will skip them anyway. */ range_end = min_t(u64, range_end, i_size_read(inode)); if (range_start >= range_end) return 0; - pages = kcalloc(ocfs2_pages_per_cluster(sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { + folios = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct folio *), GFP_NOFS); + if (folios == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; @@ -6978,18 +7004,18 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) goto out; - ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, - &numpages); + ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios, + &numfolios); if (ret) { mlog_errno(ret); goto out; } - ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, - numpages, phys, handle); + ocfs2_zero_cluster_folios(inode, range_start, range_end, folios, + numfolios, phys, handle); /* - * Initiate writeout of the pages we zero'd here. We don't + * Initiate writeout of the folios we zero'd here. We don't * wait on them - the truncate_inode_pages() call later will * do that for us. 
*/ @@ -6999,7 +7025,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - kfree(pages); + kfree(folios); return ret; } @@ -7052,7 +7078,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, has_data, num_pages = 0; + int ret, has_data, num_folios = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7061,7 +7087,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct ocfs2_extent_tree et; int did_quota = 0; @@ -7112,12 +7138,12 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, /* * Save two copies, one for insert, and one that can - * be changed by ocfs2_map_and_dirty_page() below. + * be changed by ocfs2_map_and_dirty_folio() below. */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, - &num_pages); + ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio, + &num_folios); if (ret) { mlog_errno(ret); need_free = 1; @@ -7128,15 +7154,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. 
*/ - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, - &phys); + ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -7167,8 +7193,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (page) - ocfs2_unlock_and_free_pages(&page, num_pages); + if (folio) + ocfs2_unlock_and_free_folios(&folio, num_folios); out_commit: if (ret < 0 && did_quota) @@ -7436,10 +7462,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); @@ -7642,7 +7668,7 @@ out_mutex: goto next_group; } out: - range->len = trimmed * sb->s_blocksize; + range->len = trimmed * osb->s_clustersize; return ret; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4af7abaa6e40..1c0c83362904 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -254,11 +254,9 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num); -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys); +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t 
*handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys); /* * Structures which describe a path through a btree, and functions to * manipulate them. diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1d65f6ef00ca..76c86f1c2b1c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -46,7 +46,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh = NULL; struct buffer_head *buffer_cache_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - void *kaddr; trace_ocfs2_symlink_get_block( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -91,17 +90,11 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, * could've happened. Since we've got a reference on * the bh, even if it commits while we're doing the * copy, the data is still good. */ - if (buffer_jbd(buffer_cache_bh) - && ocfs2_inode_is_new(inode)) { - kaddr = kmap_atomic(bh_result->b_page); - if (!kaddr) { - mlog(ML_ERROR, "couldn't kmap!\n"); - goto bail; - } - memcpy(kaddr + (bh_result->b_size * iblock), - buffer_cache_bh->b_data, - bh_result->b_size); - kunmap_atomic(kaddr); + if (buffer_jbd(buffer_cache_bh) && ocfs2_inode_is_new(inode)) { + memcpy_to_folio(bh_result->b_folio, + bh_result->b_size * iblock, + buffer_cache_bh->b_data, + bh_result->b_size); set_buffer_uptodate(bh_result); } brelse(buffer_cache_bh); @@ -156,9 +149,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, &ext_flags); if (err) { - mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " - "%llu, NULL)\n", err, inode, (unsigned long long)iblock, - (unsigned long long)p_blkno); + mlog(ML_ERROR, "get_blocks() failed, inode: 0x%p, " + "block: %llu\n", inode, (unsigned long long)iblock); goto bail; } @@ -216,10 +208,9 @@ bail: return err; } -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio 
*folio, struct buffer_head *di_bh) { - void *kaddr; loff_t size; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -231,7 +222,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); - if (size > PAGE_SIZE || + if (size > folio_size(folio) || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu\n", @@ -240,25 +231,18 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, return -EROFS; } - kaddr = kmap_atomic(page); - if (size) - memcpy(kaddr, di->id2.i_data.id_data, size); - /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_SIZE - size); - flush_dcache_page(page); - kunmap_atomic(kaddr); - - SetPageUptodate(page); + folio_fill_tail(folio, 0, di->id2.i_data.id_data, size); + folio_mark_uptodate(folio); return 0; } -static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +static int ocfs2_readpage_inline(struct inode *inode, struct folio *folio) { int ret; struct buffer_head *di_bh = NULL; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); ret = ocfs2_read_inode_block(inode, &di_bh); @@ -267,9 +251,9 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) goto out; } - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); out: - unlock_page(page); + folio_unlock(folio); brelse(di_bh); return ret; @@ -284,7 +268,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page); + ret = ocfs2_inode_lock_with_folio(inode, NULL, 0, folio); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -306,7 +290,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) } 
/* - * i_size might have just been updated as we grabed the meta lock. We + * i_size might have just been updated as we grabbed the meta lock. We * might now be discovering a truncate that hit on another node. * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. Callers @@ -323,7 +307,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, &folio->page); + ret = ocfs2_readpage_inline(inode, folio); else ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; @@ -389,21 +373,18 @@ out_unlock: /* Note: Because we don't support holes, our allocation has * already happened (allocation writes zeros to the file data) * so we don't have to worry about ordered writes in - * ocfs2_writepage. + * ocfs2_writepages. * - * ->writepage is called during the process of invalidating the page cache + * ->writepages is called during the process of invalidating the page cache * during blocked lock processing. It can't block on any cluster locks * to during block mapping. It's relying on the fact that the block * mapping can't have disappeared under the dirty pages that it is * being asked to write back. */ -static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +static int ocfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - trace_ocfs2_writepage( - (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, - page->index); - - return block_write_full_page(page, ocfs2_get_block, wbc); + return mpage_writepages(mapping, wbc, ocfs2_get_block); } /* Taken from ext3. 
We don't necessarily need the full blown @@ -538,7 +519,7 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, * * from == to == 0 is code for "zero the entire cluster region" */ -static void ocfs2_clear_page_regions(struct page *page, +static void ocfs2_clear_folio_regions(struct folio *folio, struct ocfs2_super *osb, u32 cpos, unsigned from, unsigned to) { @@ -547,7 +528,7 @@ static void ocfs2_clear_page_regions(struct page *page, ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (from || to) { if (from > cluster_start) @@ -558,20 +539,20 @@ static void ocfs2_clear_page_regions(struct page *page, memset(kaddr + cluster_start, 0, cluster_end - cluster_start); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); } /* * Nonsparse file systems fully allocate before we get to the write * code. This prevents ocfs2_write() from tagging the write as an - * allocating one, which means ocfs2_map_page_blocks() might try to + * allocating one, which means ocfs2_map_folio_blocks() might try to * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. */ -static int ocfs2_should_read_blk(struct inode *inode, struct page *page, +static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, unsigned int block_start) { - u64 offset = page_offset(page) + block_start; + u64 offset = folio_pos(folio) + block_start; if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) return 1; @@ -589,7 +570,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page, * * This will also skip zeroing, which is handled externally. 
*/ -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { @@ -598,10 +579,10 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, unsigned int block_end, block_start; unsigned int bsize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, bsize, 0); - head = page_buffers(page); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; @@ -613,7 +594,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * they may belong to unallocated clusters. */ if (block_start >= to || block_end <= from) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); continue; } @@ -630,11 +611,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, clean_bdev_bh_alias(bh); } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && - ocfs2_should_read_blk(inode, page, block_start) && + ocfs2_should_read_blk(inode, folio, block_start) && (block_start < from || block_end > to)) { bh_read_nowait(bh, 0); *wait_bh++=bh; @@ -668,7 +649,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user(page, block_start, bh->b_size); + folio_zero_range(folio, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -732,24 +713,24 @@ struct ocfs2_write_ctxt { unsigned int w_large_pages; /* - * Pages involved in this write. + * Folios involved in this write. * - * w_target_page is the page being written to by the user. + * w_target_folio is the folio being written to by the user. 
* - * w_pages is an array of pages which always contains - * w_target_page, and in the case of an allocating write with + * w_folios is an array of folios which always contains + * w_target_folio, and in the case of an allocating write with * page_size < cluster size, it will contain zero'd and mapped - * pages adjacent to w_target_page which need to be written + * pages adjacent to w_target_folio which need to be written * out in so that future reads from that region will get * zero's. */ - unsigned int w_num_pages; - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; - struct page *w_target_page; + unsigned int w_num_folios; + struct folio *w_folios[OCFS2_MAX_CTXT_PAGES]; + struct folio *w_target_folio; /* * w_target_locked is used for page_mkwrite path indicating no unlocking - * against w_target_page in ocfs2_write_end_nolock. + * against w_target_folio in ocfs2_write_end_nolock. */ unsigned int w_target_locked:1; @@ -774,40 +755,40 @@ struct ocfs2_write_ctxt { unsigned int w_unwritten_count; }; -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios) { int i; - for(i = 0; i < num_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - mark_page_accessed(pages[i]); - put_page(pages[i]); - } + for(i = 0; i < num_folios; i++) { + if (!folios[i]) + continue; + folio_unlock(folios[i]); + folio_mark_accessed(folios[i]); + folio_put(folios[i]); } } -static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) +static void ocfs2_unlock_folios(struct ocfs2_write_ctxt *wc) { int i; /* * w_target_locked is only set to true in the page_mkwrite() case. * The intent is to allow us to lock the target page from write_begin() - * to write_end(). The caller must hold a ref on w_target_page. + * to write_end(). The caller must hold a ref on w_target_folio. 
*/ if (wc->w_target_locked) { - BUG_ON(!wc->w_target_page); - for (i = 0; i < wc->w_num_pages; i++) { - if (wc->w_target_page == wc->w_pages[i]) { - wc->w_pages[i] = NULL; + BUG_ON(!wc->w_target_folio); + for (i = 0; i < wc->w_num_folios; i++) { + if (wc->w_target_folio == wc->w_folios[i]) { + wc->w_folios[i] = NULL; break; } } - mark_page_accessed(wc->w_target_page); - put_page(wc->w_target_page); + folio_mark_accessed(wc->w_target_folio); + folio_put(wc->w_target_folio); } - ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + ocfs2_unlock_and_free_folios(wc->w_folios, wc->w_num_folios); } static void ocfs2_free_unwritten_list(struct inode *inode, @@ -829,7 +810,7 @@ static void ocfs2_free_write_ctxt(struct inode *inode, struct ocfs2_write_ctxt *wc) { ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); - ocfs2_unlock_pages(wc); + ocfs2_unlock_folios(wc); brelse(wc->w_di_bh); kfree(wc); } @@ -872,29 +853,30 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. 
*/ -static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ocfs2_zero_new_buffers(struct folio *folio, size_t from, size_t to) { unsigned int block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, end; start = max(from, block_start); end = min(to, block_end); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); set_buffer_uptodate(bh); } @@ -919,29 +901,26 @@ static void ocfs2_write_failure(struct inode *inode, int i; unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; - struct page *tmppage; - if (wc->w_target_page) - ocfs2_zero_new_buffers(wc->w_target_page, from, to); + if (wc->w_target_folio) + ocfs2_zero_new_buffers(wc->w_target_folio, from, to); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + struct folio *folio = wc->w_folios[i]; - if (tmppage && page_has_buffers(tmppage)) { + if (folio && folio_buffers(folio)) { if (ocfs2_should_order_data(inode)) ocfs2_jbd2_inode_add_write(wc->w_handle, inode, user_pos, user_len); - block_commit_write(tmppage, from, to); + block_commit_write(folio, from, to); } } } -static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, - struct ocfs2_write_ctxt *wc, - struct page *page, u32 cpos, - loff_t user_pos, unsigned user_len, - int new) +static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, struct folio *folio, u32 cpos, + loff_t user_pos, unsigned user_len, int new) { int ret; unsigned int map_from = 0, map_to = 
0; @@ -954,20 +933,19 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, /* treat the write as new if the a hole/lseek spanned across * the page boundary. */ - new = new | ((i_size_read(inode) <= page_offset(page)) && - (page_offset(page) <= user_pos)); + new = new | ((i_size_read(inode) <= folio_pos(folio)) && + (folio_pos(folio) <= user_pos)); - if (page == wc->w_target_page) { + if (folio == wc->w_target_folio) { map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; if (new) - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, - new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); else - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - map_from, map_to, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + map_from, map_to, new); if (ret) { mlog_errno(ret); goto out; @@ -981,7 +959,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, } } else { /* - * If we haven't allocated the new page yet, we + * If we haven't allocated the new folio yet, we * shouldn't be writing it out without copying user * data. This is likely a math error from the caller. */ @@ -990,8 +968,8 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, map_from = cluster_start; map_to = cluster_end; - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; @@ -999,20 +977,20 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, } /* - * Parts of newly allocated pages need to be zero'd. + * Parts of newly allocated folios need to be zero'd. * * Above, we have also rewritten 'to' and 'from' - as far as * the rest of the function is concerned, the entire cluster - * range inside of a page needs to be written. 
+ * range inside of a folio needs to be written. * - * We can skip this if the page is up to date - it's already + * We can skip this if the folio is uptodate - it's already * been zero'd from being read in as a hole. */ - if (new && !PageUptodate(page)) - ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + if (new && !folio_test_uptodate(folio)) + ocfs2_clear_folio_regions(folio, OCFS2_SB(inode->i_sb), cpos, user_data_from, user_data_to); - flush_dcache_page(page); + flush_dcache_folio(folio); out: return ret; @@ -1021,11 +999,9 @@ out: /* * This function will only grab one clusters worth of pages. */ -static int ocfs2_grab_pages_for_write(struct address_space *mapping, - struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, - unsigned user_len, int new, - struct page *mmap_page) +static int ocfs2_grab_folios_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct folio *mmap_folio) { int ret = 0, i; unsigned long start, target_index, end_index, index; @@ -1042,7 +1018,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * last page of the write. 
*/ if (new) { - wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + wc->w_num_folios = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); /* * We need the index *past* the last page we could possibly @@ -1052,15 +1028,15 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; - if ((start + wc->w_num_pages) > end_index) - wc->w_num_pages = end_index - start; + if ((start + wc->w_num_folios) > end_index) + wc->w_num_folios = end_index - start; } else { - wc->w_num_pages = 1; + wc->w_num_folios = 1; start = target_index; } end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; - for(i = 0; i < wc->w_num_pages; i++) { + for(i = 0; i < wc->w_num_folios; i++) { index = start + i; if (index >= target_index && index <= end_index && @@ -1070,37 +1046,39 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * and wants us to directly use the page * passed in. */ - lock_page(mmap_page); + folio_lock(mmap_folio); /* Exit and let the caller retry */ - if (mmap_page->mapping != mapping) { - WARN_ON(mmap_page->mapping); - unlock_page(mmap_page); + if (mmap_folio->mapping != mapping) { + WARN_ON(mmap_folio->mapping); + folio_unlock(mmap_folio); ret = -EAGAIN; goto out; } - get_page(mmap_page); - wc->w_pages[i] = mmap_page; + folio_get(mmap_folio); + wc->w_folios[i] = mmap_folio; wc->w_target_locked = true; } else if (index >= target_index && index <= end_index && wc->w_type == OCFS2_WRITE_DIRECT) { /* Direct write has no mapping page. 
*/ - wc->w_pages[i] = NULL; + wc->w_folios[i] = NULL; continue; } else { - wc->w_pages[i] = find_or_create_page(mapping, index, - GFP_NOFS); - if (!wc->w_pages[i]) { - ret = -ENOMEM; + wc->w_folios[i] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(wc->w_folios[i])) { + ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); + wc->w_folios[i] = NULL; goto out; } } - wait_for_stable_page(wc->w_pages[i]); + folio_wait_stable(wc->w_folios[i]); if (index == target_index) - wc->w_target_page = wc->w_pages[i]; + wc->w_target_folio = wc->w_folios[i]; } out: if (ret) @@ -1184,19 +1162,18 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (!should_zero) p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); - for(i = 0; i < wc->w_num_pages; i++) { + for (i = 0; i < wc->w_num_folios; i++) { int tmpret; /* This is the direct io target page. */ - if (wc->w_pages[i] == NULL) { - p_blkno++; + if (wc->w_folios[i] == NULL) { + p_blkno += (1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits)); continue; } - tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, - wc->w_pages[i], cpos, - user_pos, user_len, - should_zero); + tmpret = ocfs2_prepare_folio_for_write(inode, &p_blkno, wc, + wc->w_folios[i], cpos, user_pos, user_len, + should_zero); if (tmpret) { mlog_errno(tmpret); if (ret == 0) @@ -1475,7 +1452,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct page *page; + struct folio *folio; handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; @@ -1486,19 +1463,21 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - page = find_or_create_page(mapping, 0, GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, 0, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { ocfs2_commit_trans(osb, handle); - ret = -ENOMEM; + ret 
= PTR_ERR(folio); mlog_errno(ret); goto out; } /* - * If we don't set w_num_pages then this page won't get unlocked + * If we don't set w_num_folios then this folio won't get unlocked * and freed on cleanup of the write context. */ - wc->w_pages[0] = wc->w_target_page = page; - wc->w_num_pages = 1; + wc->w_target_folio = folio; + wc->w_folios[0] = folio; + wc->w_num_folios = 1; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1512,8 +1491,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) ocfs2_set_inode_data_inline(inode, di); - if (!PageUptodate(page)) { - ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (!folio_test_uptodate(folio)) { + ret = ocfs2_read_inline_data(inode, folio, wc->w_di_bh); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1536,9 +1515,8 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) } static int ocfs2_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, loff_t pos, - unsigned len, struct page *mmap_page, - struct ocfs2_write_ctxt *wc) + struct inode *inode, loff_t pos, size_t len, + struct folio *mmap_folio, struct ocfs2_write_ctxt *wc) { int ret, written = 0; loff_t end = pos + len; @@ -1553,7 +1531,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Handle inodes which already have inline data 1st. */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (mmap_page == NULL && + if (mmap_folio == NULL && ocfs2_size_fits_inline_data(wc->w_di_bh, end)) goto do_inline_write; @@ -1577,7 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Check whether the write can fit. 
*/ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - if (mmap_page || + if (mmap_folio || end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; @@ -1644,9 +1622,9 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, } int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page) + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; @@ -1669,7 +1647,7 @@ try_again: if (ocfs2_supports_inline_data(osb)) { ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, - mmap_page, wc); + mmap_folio, wc); if (ret == 1) { ret = 0; goto success; @@ -1721,7 +1699,7 @@ try_again: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - pos, len, type, mmap_page, + pos, len, type, mmap_folio, clusters_to_alloc, extents_to_split); /* @@ -1792,21 +1770,21 @@ try_again: } /* - * Fill our page array first. That way we've grabbed enough so + * Fill our folio array first. That way we've grabbed enough so * that we can zero and flush if we error after adding the * extent. */ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, - cluster_of_pages, mmap_page); + ret = ocfs2_grab_folios_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_folio); if (ret) { /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. + * ocfs2_grab_folios_for_write() returns -EAGAIN if it + * could not lock the target folio. 
In this case, we exit + * with no error and no target folio. This will trigger + * the caller, page_mkwrite(), to re-try the operation. */ if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { - BUG_ON(wc->w_target_page); + BUG_ON(wc->w_target_folio); ret = 0; goto out_quota; } @@ -1828,8 +1806,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - if (pagep) - *pagep = wc->w_target_page; + if (foliop) + *foliop = wc->w_target_folio; *fsdata = wc; return 0; out_quota: @@ -1848,7 +1826,7 @@ out: * to VM code. */ if (wc->w_target_locked) - unlock_page(mmap_page); + folio_unlock(mmap_folio); ocfs2_free_write_ctxt(inode, wc); @@ -1879,9 +1857,10 @@ out: return ret; } -static int ocfs2_write_begin(struct file *file, struct address_space *mapping, +static int ocfs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; struct buffer_head *di_bh = NULL; @@ -1903,7 +1882,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, - pagep, fsdata, di_bh, NULL); + foliop, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; @@ -1927,18 +1906,15 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, struct ocfs2_dinode *di, struct ocfs2_write_ctxt *wc) { - void *kaddr; - if (unlikely(*copied < len)) { - if (!PageUptodate(wc->w_target_page)) { + if (!folio_test_uptodate(wc->w_target_folio)) { *copied = 0; return; } } - kaddr = kmap_atomic(wc->w_target_page); - memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); - kunmap_atomic(kaddr); + memcpy_from_folio(di->id2.i_data.id_data + pos, wc->w_target_folio, + pos, *copied); trace_ocfs2_write_end_inline( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1947,17 +1923,16 @@ static void ocfs2_write_end_inline(struct inode *inode, 
loff_t pos, le16_to_cpu(di->i_dyn_features)); } -int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, void *fsdata) +int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, void *fsdata) { int i, ret; - unsigned from, to, start = pos & (PAGE_SIZE - 1); + size_t from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; handle_t *handle = wc->w_handle; - struct page *tmppage; BUG_ON(!list_empty(&wc->w_unwritten_list)); @@ -1976,29 +1951,44 @@ int ocfs2_write_end_nolock(struct address_space *mapping, goto out_write_size; } - if (unlikely(copied < len) && wc->w_target_page) { - if (!PageUptodate(wc->w_target_page)) + if (unlikely(copied < len) && wc->w_target_folio) { + loff_t new_isize; + + if (!folio_test_uptodate(wc->w_target_folio)) copied = 0; - ocfs2_zero_new_buffers(wc->w_target_page, start+copied, - start+len); + new_isize = max_t(loff_t, i_size_read(inode), pos + copied); + if (new_isize > folio_pos(wc->w_target_folio)) + ocfs2_zero_new_buffers(wc->w_target_folio, start+copied, + start+len); + else { + /* + * When folio is fully beyond new isize (data copy + * failed), do not bother zeroing the folio. Invalidate + * it instead so that writeback does not get confused + * put page & buffer dirty bits into inconsistent + * state. + */ + block_invalidate_folio(wc->w_target_folio, 0, + folio_size(wc->w_target_folio)); + } } - if (wc->w_target_page) - flush_dcache_page(wc->w_target_page); + if (wc->w_target_folio) + flush_dcache_folio(wc->w_target_folio); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + struct folio *folio = wc->w_folios[i]; - /* This is the direct io target page. 
*/ - if (tmppage == NULL) + /* This is the direct io target folio */ + if (folio == NULL) continue; - if (tmppage == wc->w_target_page) { + if (folio == wc->w_target_folio) { from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_SIZE || - to > PAGE_SIZE || + BUG_ON(from > folio_size(folio) || + to > folio_size(folio) || to < from); } else { /* @@ -2007,19 +1997,17 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_SIZE; + to = folio_size(folio); } - if (page_has_buffers(tmppage)) { + if (folio_buffers(folio)) { if (handle && ocfs2_should_order_data(inode)) { - loff_t start_byte = - ((loff_t)tmppage->index << PAGE_SHIFT) + - from; + loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); } - block_commit_write(tmppage, from, to); + block_commit_write(folio, from, to); } } @@ -2033,9 +2021,9 @@ out_write_size: } inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); if (handle) ocfs2_update_inode_fsync_trans(handle, inode, 1); } @@ -2048,7 +2036,7 @@ out: * this lock and will ask for the page lock when flushing the data. * put it here to preserve the unlock order. 
*/ - ocfs2_unlock_pages(wc); + ocfs2_unlock_folios(wc); if (handle) ocfs2_commit_trans(osb, handle); @@ -2061,9 +2049,10 @@ out: return copied; } -static int ocfs2_write_end(struct file *file, struct address_space *mapping, +static int ocfs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; struct inode *inode = mapping->host; @@ -2270,8 +2259,6 @@ unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out: - if (ret < 0) - ret = -EIO; return ret; } @@ -2355,6 +2342,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode, } list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_assure_trans_credits(handle, credits); + if (ret < 0) { + mlog_errno(ret); + break; + } ret = ocfs2_mark_extent_written(inode, &et, handle, ue->ue_cpos, 1, ue->ue_phys, @@ -2448,14 +2440,14 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, - ocfs2_dio_end_io, NULL, 0); + ocfs2_dio_end_io, 0); } const struct address_space_operations ocfs2_aops = { .dirty_folio = block_dirty_folio, .read_folio = ocfs2_read_folio, .readahead = ocfs2_readahead, - .writepage = ocfs2_writepage, + .writepages = ocfs2_writepages, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, @@ -2464,5 +2456,5 @@ const struct address_space_operations ocfs2_aops = { .release_folio = ocfs2_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 3a520117fa59..114efc9111e4 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -8,16 +8,11 @@ #include <linux/fs.h> -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct 
page *page, - unsigned from, - unsigned to); - -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios); int walk_page_buffers( handle_t *handle, struct buffer_head *head, @@ -37,11 +32,11 @@ typedef enum { } ocfs2_write_type_t; int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page); + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio); -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); @@ -70,6 +65,8 @@ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_NUM_LOCKS }; +#define ocfs2_iocb_init_rw_locked(iocb) \ + (iocb->private = NULL) #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 196638a22b48..8f714406528d 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -158,7 +158,7 @@ read_failure: if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to - * aovoid bh leak + * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -235,7 +235,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { - ocfs2_metadata_cache_io_unlock(ci); status = -ENOMEM; 
mlog_errno(status); /* Don't forget to put previous bh! */ @@ -345,7 +344,7 @@ read_failure: if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to - * aovoid bh leak + * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -389,7 +388,8 @@ read_failure: /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - ocfs2_set_buffer_uptodate(ci, bh); + if (bh) + ocfs2_set_buffer_uptodate(ci, bh); } ocfs2_metadata_cache_io_unlock(ci); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 60b97c92e2b2..724350925aff 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -3,6 +3,7 @@ * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ +#include "linux/kstrtox.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/jiffies.h> @@ -213,7 +214,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct block_device *hr_bdev; + struct file *hr_bdev_file; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -261,6 +262,11 @@ struct o2hb_region { int hr_last_hb_status; }; +static inline struct block_device *reg_bdev(struct o2hb_region *reg) +{ + return reg->hr_bdev_file ? 
file_bdev(reg->hr_bdev_file) : NULL; +} + struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; @@ -286,7 +292,7 @@ static void o2hb_write_timeout(struct work_struct *work) hr_write_timeout_work.work); mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " - "milliseconds\n", reg->hr_bdev, + "milliseconds\n", reg_bdev(reg), jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { @@ -383,7 +389,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(master_node, reg->hr_nego_node_bitmap); } if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, @@ -398,7 +404,8 @@ static void o2hb_nego_timeout(struct work_struct *work) } printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), + reg_bdev(reg)); /* approve negotiate timeout request. */ o2hb_arm_timeout(reg); @@ -419,7 +426,7 @@ static void o2hb_nego_timeout(struct work_struct *work) /* negotiate timeout with master node. 
*/ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), - reg->hr_bdev, master_node); + reg_bdev(reg), master_node); ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, master_node); if (ret) @@ -436,7 +443,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, nego_msg = (struct o2hb_nego_msg *)msg->buf; printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", - nego_msg->node_num, config_item_name(®->hr_item), reg->hr_bdev); + nego_msg->node_num, config_item_name(®->hr_item), + reg_bdev(reg)); if (nego_msg->node_num < O2NM_MAX_NODES) set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); else @@ -451,7 +459,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, struct o2hb_region *reg = data; printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); o2hb_arm_timeout(reg); return 0; } @@ -515,7 +523,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. 
*/ - bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC); + bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -687,7 +695,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) errstr = ERRSTR3; mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev, + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg), slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -861,7 +869,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -920,7 +928,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, * consider it a transient miss but don't populate any * other values as they may be junk. */ mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", - slot->ds_node_num, reg->hr_bdev); + slot->ds_node_num, reg_bdev(reg)); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -1003,8 +1011,8 @@ fire_callbacks: "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_bdev, slot_dead_ms, - dead_ms); + slot->ds_node_num, reg_bdev(reg), + slot_dead_ms, dead_ms); } goto out; } @@ -1013,7 +1021,7 @@ fire_callbacks: if (list_empty(&slot->ds_live_item)) goto out; - /* live nodes only go dead after enough consequtive missed + /* live nodes only go dead after enough consecutive missed * samples.. 
reset the missed counter whenever we see * activity */ if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { @@ -1143,7 +1151,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * can't be sure that the new block ever made it to * disk */ mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", - write_wc.wc_error, reg->hr_bdev); + write_wc.wc_error, reg_bdev(reg)); ret = write_wc.wc_error; goto bail; } @@ -1169,7 +1177,7 @@ bail: printk(KERN_NOTICE "o2hb: Unable to stabilize " "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_bdev); + reg_bdev(reg)); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1489,7 +1497,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg)); kfree(reg->hr_tmp_block); @@ -1502,8 +1510,8 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + if (reg->hr_bdev_file) + fput(reg->hr_bdev_file); kfree(reg->hr_slots); @@ -1528,10 +1536,11 @@ static int o2hb_read_block_input(struct o2hb_region *reg, { unsigned long bytes; char *p = (char *)page; + int ret; - bytes = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &bytes); + if (ret) + return ret; /* Heartbeat and fs min / max block sizes are the same. 
*/ if (bytes > 4096 || bytes < 512) @@ -1562,7 +1571,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; status = o2hb_read_block_input(reg, page, &block_bytes, @@ -1591,7 +1600,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, char *p = (char *)page; ssize_t ret; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; ret = kstrtoull(p, 0, &tmp); @@ -1615,13 +1624,14 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; + int ret; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &tmp); + if (ret) + return ret; if (tmp > O2NM_MAX_NODES || tmp == 0) return -ERANGE; @@ -1635,8 +1645,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (to_o2hb_region(item)->hr_bdev) - ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev); + if (to_o2hb_region(item)->hr_bdev_file) + ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; } @@ -1745,7 +1755,10 @@ out: return ret; } -/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +/* + * this is acting as commit; we set up all of hr_bdev_file and hr_task or + * nothing + */ static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) @@ -1755,45 +1768,44 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, long fd; int sectsize; char *p = (char *)page; - struct fd f; ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev) - goto out; + if (reg->hr_bdev_file) + return -EINVAL; /* We can't heartbeat without having had our node number * configured yet. 
*/ if (o2nm_this_node() == O2NM_MAX_NODES) - goto out; + return -EINVAL; - fd = simple_strtol(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - goto out; + ret = kstrtol(p, 0, &fd); + if (ret < 0) + return -EINVAL; if (fd < 0 || fd >= INT_MAX) - goto out; + return -EINVAL; - f = fdget(fd); - if (f.file == NULL) - goto out; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EINVAL; if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || reg->hr_block_bytes == 0) - goto out2; + return -EINVAL; - if (!S_ISBLK(f.file->f_mapping->host->i_mode)) - goto out2; + if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode)) + return -EINVAL; - reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, - FMODE_WRITE | FMODE_READ, NULL); - if (IS_ERR(reg->hr_bdev)) { - ret = PTR_ERR(reg->hr_bdev); - reg->hr_bdev = NULL; - goto out2; + reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev, + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(reg->hr_bdev_file)) { + ret = PTR_ERR(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; + return ret; } - sectsize = bdev_logical_block_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg_bdev(reg)); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", @@ -1889,16 +1901,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (hb_task && o2hb_global_heartbeat_active()) printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); out3: if (ret < 0) { - blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE); - reg->hr_bdev = NULL; + fput(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; } -out2: - fdput(f); -out: return ret; } @@ -2084,7 +2093,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? 
"stopped" : "start aborted"), config_item_name(item), - reg->hr_bdev); + reg_bdev(reg)); } /* @@ -2130,10 +2139,11 @@ static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *ite { unsigned long tmp; char *p = (char *)page; + int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 10, &tmp); + if (ret) + return ret; /* this will validate ranges for us. */ o2hb_dead_threshold_set((unsigned int) tmp); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index b73fc42e46ff..630bd5a3dd0d 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -29,7 +29,7 @@ * just calling printk() so that this can eventually make its way through * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant + * only emit the appropriate printk() when the caller passes in a constant * mask, as is almost always the case. 
* * All this bitmask nonsense is managed from the files under diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 35c05c18de59..bc27301eab6d 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -44,17 +44,17 @@ static LIST_HEAD(send_tracking); void o2net_debug_add_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&nst->st_net_debug_item, &send_tracking); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); if (!list_empty(&nst->st_net_debug_item)) list_del_init(&nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } static struct o2net_send_tracking @@ -84,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; } @@ -95,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); list_del_init(&dummy_nst->st_net_debug_item); if (nst) list_add(&dummy_nst->st_net_debug_item, &nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; /* unused, just needs to be null when done */ } @@ -112,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) ktime_t now; s64 sock, send, status; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); if (!nst) goto out; @@ -145,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) (long 
long)status); out: - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -191,16 +191,16 @@ static const struct file_operations nst_seq_fops = { void o2net_debug_add_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&sc->sc_net_debug_item, &sock_containers); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_del_init(&sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } struct o2net_sock_debug { @@ -236,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; } @@ -248,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); list_del_init(&dummy_sc->sc_net_debug_item); if (sc) list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; /* unused, just needs to be null when done */ } @@ -349,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); if (sc) { @@ -359,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); 
+ spin_unlock_bh(&o2net_debug_lock); return 0; } diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 189c111bc371..bfb8b456876c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -23,7 +23,7 @@ * race between when we see a node start heartbeating and when we connect * to it. * - * So nodes that are in this transtion put a hold on the quorum decision + * So nodes that are in this transition put a hold on the quorum decision * with a counter. As they fall out of this transition they drop the count * and if they're the last, they fire off the decision. */ @@ -60,7 +60,7 @@ static void o2quo_fence_self(void) switch (o2nm_single_cluster->cl_fence_method) { case O2NM_FENCE_PANIC: panic("*** ocfs2 is very sorry to be fencing this system by " - "panicing ***\n"); + "panicking ***\n"); break; default: WARN_ON(o2nm_single_cluster->cl_fence_method >= @@ -93,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work) int lowest_hb, lowest_reachable = 0, fence = 0; struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); if (lowest_hb != O2NM_MAX_NODES) @@ -146,14 +146,14 @@ static void o2quo_make_decision(struct work_struct *work) out: if (fence) { - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); o2quo_fence_self(); } else { mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " "connected: %d, lowest: %d (%sreachable)\n", qs->qs_heartbeating, qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } @@ -189,14 +189,14 @@ static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) } /* as a node comes up we delay the quorum decision until we know the fate of - * the connection. the hold will be droped in conn_up or hb_down. it might be + * the connection. the hold will be dropped in conn_up or hb_down. 
it might be * perpetuated by con_err until hb_down. if we already have a conn, we might * be dropping a hold that conn_up got. */ void o2quo_hb_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating++; mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, @@ -211,7 +211,7 @@ void o2quo_hb_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* hb going down releases any holds we might have had due to this node from @@ -220,7 +220,7 @@ void o2quo_hb_down(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating--; mlog_bug_on_msg(qs->qs_heartbeating < 0, @@ -233,7 +233,7 @@ void o2quo_hb_down(u8 node) o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* this tells us that we've decided that the node is still heartbeating @@ -245,18 +245,18 @@ void o2quo_hb_still_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); mlog(0, "node %u\n", node); qs->qs_pending = 1; o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* This is analogous to hb_up. as a node's connection comes up we delay the - * quorum decision until we see it heartbeating. the hold will be droped in + * quorum decision until we see it heartbeating. the hold will be dropped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if * it's already heartbeating we might be dropping a hold that conn_up got. 
* */ @@ -264,7 +264,7 @@ void o2quo_conn_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_connected++; mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, @@ -279,7 +279,7 @@ void o2quo_conn_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* we've decided that we won't ever be connecting to the node again. if it's @@ -290,7 +290,7 @@ void o2quo_conn_err(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); if (test_bit(node, qs->qs_conn_bm)) { qs->qs_connected--; @@ -307,7 +307,7 @@ void o2quo_conn_err(u8 node) mlog(0, "node %u, %d total\n", node, qs->qs_connected); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } void o2quo_init(void) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a07b24d170f2..79b281e32f4c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -5,13 +5,13 @@ * * ---- * - * Callers for this were originally written against a very simple synchronus + * Callers for this were originally written against a very simple synchronous * API. This implementation reflects those simple callers. Some day I'm sure * we'll need to move to a more robust posting/callback mechanism. * * Transmit calls pass in kernel virtual addresses and block copying this into * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an + * for a failed socket to timeout. TX callers can also pass in a pointer to an * 'int' which gets filled with an errno off the wire in response to the * message they send. * @@ -46,6 +46,7 @@ #include <linux/net.h> #include <linux/export.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include <linux/uaccess.h> @@ -100,7 +101,7 @@ static struct socket *o2net_listen_sock; * o2net_wq. 
teardown detaches the callbacks before destroying the workqueue. * quorum work is queued as sock containers are shutdown.. stop_listening * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and + * and queued quorum work, before canceling delayed quorum work and * destroying the work queue. */ static struct workqueue_struct *o2net_wq; @@ -585,6 +586,8 @@ static void o2net_data_ready(struct sock *sk) void (*ready)(struct sock *sk); struct o2net_sock_container *sc; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc) { @@ -721,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work) if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { /* we shouldn't flush as we're in the thread, the * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); + timer_delete_sync(&sc->sc_idle_timeout); o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); sc_put(sc); kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); @@ -927,19 +930,22 @@ out: } static void o2net_sendpage(struct o2net_sock_container *sc, - void *kmalloced_virt, - size_t size) + void *virt, size_t size) { struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + struct msghdr msg = {}; + struct bio_vec bv; ssize_t ret; + bvec_set_virt(&bv, virt, size); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); + while (1) { + msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - offset_in_page(kmalloced_virt), - size, MSG_DONTWAIT); + ret = sock_sendmsg(sc->sc_sock, &msg); mutex_unlock(&sc->sc_send_lock); + if (ret == size) break; if (ret == (ssize_t)-EAGAIN) { @@ -1413,7 +1419,7 @@ out: return ret; } -/* this work func is triggerd by data ready. it reads until it can read no +/* this work func is triggered by data ready. 
it reads until it can read no * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing * our work the work struct will be marked and we'll be called again. */ static void o2net_rx_until_empty(struct work_struct *work) @@ -1477,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout); + struct o2net_sock_container *sc = timer_container_of(sc, t, + sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - @@ -1608,7 +1615,7 @@ static void o2net_start_connect(struct work_struct *work) myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; myaddr.sin_port = htons(0); /* any port */ - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr, sizeof(myaddr)); if (ret) { mlog(ML_ERROR, "bind failed with %d at address %pI4\n", @@ -1631,7 +1638,7 @@ static void o2net_start_connect(struct work_struct *work) remoteaddr.sin_port = node->nd_ipv4_port; ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, + (struct sockaddr_unsized *)&remoteaddr, sizeof(remoteaddr), O_NONBLOCK); if (ret == -EINPROGRESS) @@ -1778,6 +1785,9 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *node = NULL; struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + }; struct o2net_node *nn; unsigned int nofs_flag; @@ -1796,7 +1806,7 @@ static int o2net_accept_one(struct socket *sock, int 
*more) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; @@ -1931,6 +1941,8 @@ static void o2net_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ @@ -1990,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) INIT_WORK(&o2net_listen_work, o2net_accept_many); sock->sk->sk_reuse = SK_CAN_REUSE; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (ret < 0) { printk(KERN_ERR "o2net: Error %d while binding socket at " "%pI4:%u\n", ret, &addr, ntohs(port)); @@ -2082,18 +2094,24 @@ void o2net_stop_listening(struct o2nm_node *node) int o2net_init(void) { + struct folio *folio; + void *p; unsigned long i; o2quo_init(); - o2net_debugfs_init(); - o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); - o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) goto out; + p = folio_address(folio); + o2net_hand = p; + p += sizeof(struct o2net_handshake); + o2net_keep_req = p; + p += sizeof(struct o2net_msg); + o2net_keep_resp = p; + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); o2net_hand->connector_id = cpu_to_be64(1); @@ -2119,9 +2137,6 @@ int o2net_init(void) return 0; out: - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); o2net_debugfs_exit(); o2quo_exit(); return -ENOMEM; @@ -2130,8 +2145,6 @@ out: void o2net_exit(void) { o2quo_exit(); - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); 
o2net_debugfs_exit(); + folio_put(virt_to_folio(o2net_hand)); } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 04fc8344063a..1873bbbb7e5b 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -32,7 +32,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) } -static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int ret = 0; /* if all else fails, just return false */ @@ -44,8 +45,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); - trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_dentry_revalidate(dentry, name->len, name->name); /* For a negative dentry - * check the generation number of the parent and compare with the @@ -53,12 +53,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) */ if (inode == NULL) { unsigned long gen = (unsigned long) dentry->d_fsdata; - unsigned long pgen; - spin_lock(&dentry->d_lock); - pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; - spin_unlock(&dentry->d_lock); - trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, - dentry->d_name.name, + unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen; + trace_ocfs2_dentry_revalidate_negative(name->len, name->name, pgen, gen); if (gen != pgen) goto bail; @@ -124,17 +120,10 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (!dentry->d_fsdata) return 0; - if (!dentry->d_parent) - return 0; - if (skip_unhashed && d_unhashed(dentry)) return 0; parent = d_inode(dentry->d_parent); - /* Negative parent dentry? */ - if (!parent) - return 0; - /* Name is in a different directory. 
*/ if (OCFS2_I(parent)->ip_blkno != parent_blkno) return 0; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 694471fc46b8..2785ff245e79 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -294,13 +294,29 @@ out: * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ -static int ocfs2_check_dir_entry(struct inode * dir, - struct ocfs2_dir_entry * de, - struct buffer_head * bh, +static int ocfs2_check_dir_entry(struct inode *dir, + struct ocfs2_dir_entry *de, + struct buffer_head *bh, + char *buf, + unsigned int size, unsigned long offset) { const char *error_msg = NULL; - const int rlen = le16_to_cpu(de->rec_len); + unsigned long next_offset; + int rlen; + + if (offset > size - OCFS2_DIR_REC_LEN(1)) { + /* Dirent is (maybe partially) beyond the buffer + * boundaries so touching 'de' members is unsafe. + */ + mlog(ML_ERROR, "directory entry (#%llu: offset=%lu) " + "too close to end or out-of-bounds", + (unsigned long long)OCFS2_I(dir)->ip_blkno, offset); + return 0; + } + + rlen = le16_to_cpu(de->rec_len); + next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; @@ -308,9 +324,11 @@ static int ocfs2_check_dir_entry(struct inode * dir, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely( - ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; + else if (unlikely(next_offset > size)) + error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - OCFS2_DIR_REC_LEN(1)) && + next_offset != size) + error_msg = "directory entry too close to end"; if (unlikely(error_msg != NULL)) mlog(ML_ERROR, "bad entry in directory #%llu: %s - " @@ -352,16 +370,17 @@ static inline int ocfs2_search_dirblock(struct buffer_head *bh, de_buf = first_de; dlimit = de_buf + bytes; - while 
(de_buf < dlimit) { + while (de_buf < dlimit - OCFS2_DIR_MEMBER_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ de = (struct ocfs2_dir_entry *) de_buf; - if (de_buf + namelen <= dlimit && + if (de->name + namelen <= dlimit && ocfs2_match(namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, bh, first_de, + bytes, offset)) { ret = -1; goto bail; } @@ -772,6 +791,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_rec *rec = NULL; + if (le16_to_cpu(el->l_count) != + ocfs2_extent_recs_per_dx_root(inode->i_sb)) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has invalid extent list length %u\n", + inode->i_ino, le16_to_cpu(el->l_count)); + goto out; + } + if (el->l_tree_depth) { ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, &eb_bh); @@ -792,6 +819,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } } + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has empty extent list at depth %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth)); + goto out; + } + found = 0; for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { rec = &el->l_recs[i]; @@ -1059,26 +1094,39 @@ int ocfs2_find_entry(const char *name, int namelen, { struct buffer_head *bh; struct ocfs2_dir_entry *res_dir = NULL; + int ret = 0; if (ocfs2_dir_indexed(dir)) return ocfs2_find_entry_dx(name, namelen, dir, lookup); + if (unlikely(i_size_read(dir) <= 0)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } /* * The unindexed dir code only uses part of the lookup * structure, so there's no reason to push it down further * than this. 
*/ - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); - else + } else { bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + } if (bh == NULL) return -ENOENT; lookup->dl_leaf_bh = bh; lookup->dl_entry = res_dir; - return 0; +out: + return ret; } /* @@ -1138,7 +1186,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, pde = NULL; de = (struct ocfs2_dir_entry *) first_de; while (i < bytes) { - if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + if (!ocfs2_check_dir_entry(dir, de, bh, first_de, bytes, i)) { status = -EIO; mlog_errno(status); goto bail; @@ -1593,9 +1641,6 @@ int __ocfs2_add_entry(handle_t *handle, struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; - if (!namelen) - return -EINVAL; - if (ocfs2_dir_indexed(dir)) { struct buffer_head *bh; @@ -1638,7 +1683,8 @@ int __ocfs2_add_entry(handle_t *handle, /* These checks should've already been passed by the * prepare function, but I guess we can leave them * here anyway. 
*/ - if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, insert_bh, data_start, + size, offset)) { retval = -ENOENT; goto bail; } @@ -1658,7 +1704,8 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -1776,7 +1823,8 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, } de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); - if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { + if (!ocfs2_check_dir_entry(inode, de, di_bh, (char *)data->id_data, + i_size_read(inode), ctx->pos)) { /* On error, skip the f_pos to the end. */ ctx->pos = i_size_read(inode); break; @@ -1869,7 +1917,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, while (ctx->pos < i_size_read(inode) && offset < sb->s_blocksize) { de = (struct ocfs2_dir_entry *) (bh->b_data + offset); - if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + if (!ocfs2_check_dir_entry(inode, de, bh, bh->b_data, + sb->s_blocksize, offset)) { /* On error, skip the f_pos to the next block. 
*/ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; @@ -1925,6 +1974,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) { int error = 0; struct inode *inode = file_inode(file); + struct ocfs2_file_private *fp = file->private_data; int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1945,7 +1995,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) goto bail_nolock; } - error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); + error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false); ocfs2_inode_unlock(inode, lock_level); if (error) @@ -2002,6 +2052,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name + * Return -EFSCORRUPTED if found corruption * * Callers should have i_rwsem + a cluster lock on dir */ @@ -2015,9 +2066,12 @@ int ocfs2_check_dir_for_entry(struct inode *dir, trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = ocfs2_find_entry(name, namelen, dir, &lookup); + if (ret == 0) { ret = -EEXIST; mlog_errno(ret); + } else if (ret == -ENOENT) { + ret = 0; } ocfs2_free_dir_lookup_result(&lookup); @@ -2962,11 +3016,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); di->i_size = cpu_to_le64(sb->s_blocksize); - di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir)); ocfs2_update_inode_fsync_trans(handle, dir, 1); /* @@ 
-3341,7 +3395,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, struct super_block *sb = dir->i_sb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_dir_entry *de, *last_de = NULL; - char *de_buf, *limit; + char *first_de, *de_buf, *limit; unsigned long offset = 0; unsigned int rec_len, new_rec_len, free_space; @@ -3354,14 +3408,16 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, else free_space = dir->i_sb->s_blocksize - i_size_read(dir); - de_buf = di->id2.i_data.id_data; + first_de = di->id2.i_data.id_data; + de_buf = first_de; limit = de_buf + i_size_read(dir); rec_len = OCFS2_DIR_REC_LEN(namelen); while (de_buf < limit) { de = (struct ocfs2_dir_entry *)de_buf; - if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, di_bh, first_de, + i_size_read(dir), offset)) { ret = -ENOENT; goto out; } @@ -3388,6 +3444,14 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, offset += le16_to_cpu(de->rec_len); } + if (!last_de) { + ret = ocfs2_error(sb, "Directory entry (#%llu: size=%lld) " + "is unexpectedly short", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + i_size_read(dir)); + goto out; + } + /* * We're going to require expansion of the directory - figure * out how many blocks we'll need so that a place for the @@ -3443,7 +3507,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, /* move to next block */ de = (struct ocfs2_dir_entry *) bh->b_data; } - if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, bh, bh->b_data, blocksize, + offset)) { status = -ENOENT; goto bail; } @@ -3501,16 +3566,6 @@ static int dx_leaf_sort_cmp(const void *a, const void *b) return 0; } -static void dx_leaf_sort_swap(void *a, void *b, int size) -{ - struct ocfs2_dx_entry *entry1 = a; - struct ocfs2_dx_entry *entry2 = b; - - BUG_ON(size != sizeof(*entry1)); - - 
swap(*entry1, *entry2); -} - static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) { struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; @@ -3771,7 +3826,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, */ sort(dx_leaf->dl_list.de_entries, num_used, sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, - dx_leaf_sort_swap); + NULL); ocfs2_journal_dirty(handle, dx_leaf_bh); @@ -4078,10 +4133,15 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, } dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; - memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - - offsetof(struct ocfs2_dx_root_block, dr_list)); + + dx_root->dr_list.l_tree_depth = 0; dx_root->dr_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + dx_root->dr_list.l_next_free_rec = 0; + memset(&dx_root->dr_list.l_recs, 0, + osb->sb->s_blocksize - + (offsetof(struct ocfs2_dx_root_block, dr_list) + + offsetof(struct ocfs2_extent_list, l_recs))); /* This should never fail considering we start with an empty * dx_root. */ @@ -4244,12 +4304,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, trace_ocfs2_prepare_dir_for_insert( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); - if (!namelen) { - ret = -EINVAL; - mlog_errno(ret); - goto out; - } - /* * Do this up front to reduce confusion. 
* diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index bae60ca2672a..1969db8ffa9c 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -62,8 +62,6 @@ enum dlm_status { DLM_MAXSTATS, /* 41: upper limit for return code validation */ }; -/* for pretty-printing dlm_status error messages */ -const char *dlm_errmsg(enum dlm_status err); /* for pretty-printing dlm_status error names */ const char *dlm_errname(enum dlm_status err); @@ -120,7 +118,7 @@ struct dlm_lockstatus { #define LKM_VALBLK 0x00000100 /* lock value block request */ #define LKM_NOQUEUE 0x00000200 /* non blocking request */ #define LKM_CONVERT 0x00000400 /* conversion request */ -#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */ #define LKM_UNLOCK 0x00001000 /* deallocate this lock */ #define LKM_CANCEL 0x00002000 /* cancel conversion request */ #define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index be5e9ed7da8d..fe4fdd09bae3 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/export.h> +#include <linux/string_choices.h> #include "../cluster/heartbeat.h" #include "../cluster/nodemanager.h" @@ -90,12 +91,12 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) buf, res->owner, res->state); printk(" last used: %lu, refcnt: %u, on purge list: %s\n", res->last_used, kref_read(&res->refs), - list_empty(&res->purge) ? "no" : "yes"); + str_no_yes(list_empty(&res->purge))); printk(" on dirty list: %s, on reco list: %s, " "migrating pending: %s\n", - list_empty(&res->dirty) ? "no" : "yes", - list_empty(&res->recovering) ? "no" : "yes", - res->migration_pending ? 
"yes" : "no"); + str_no_yes(list_empty(&res->dirty)), + str_no_yes(list_empty(&res->recovering)), + str_yes_no(res->migration_pending)); printk(" inflight locks: %d, asts reserved: %d\n", res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); @@ -164,59 +165,6 @@ static const char *dlm_errnames[] = { [DLM_MAXSTATS] = "DLM_MAXSTATS", }; -static const char *dlm_errmsgs[] = { - [DLM_NORMAL] = "request in progress", - [DLM_GRANTED] = "request granted", - [DLM_DENIED] = "request denied", - [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", - [DLM_WORKING] = "async request in progress", - [DLM_BLOCKED] = "lock request blocked", - [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", - [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", - [DLM_SYSERR] = "system error", - [DLM_NOSUPPORT] = "unsupported", - [DLM_CANCELGRANT] = "can't cancel convert: already granted", - [DLM_IVLOCKID] = "bad lockid", - [DLM_SYNC] = "synchronous request granted", - [DLM_BADTYPE] = "bad resource type", - [DLM_BADRESOURCE] = "bad resource handle", - [DLM_MAXHANDLES] = "no more resource handles", - [DLM_NOCLINFO] = "can't contact cluster manager", - [DLM_NOLOCKMGR] = "can't contact lock manager", - [DLM_NOPURGED] = "can't contact purge daemon", - [DLM_BADARGS] = "bad api args", - [DLM_VOID] = "no status", - [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", - [DLM_IVBUFLEN] = "invalid resource name length", - [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", - [DLM_BADPARAM] = "invalid lock mode specified", - [DLM_VALNOTVALID] = "value block has been invalidated", - [DLM_REJECTED] = "request rejected, unrecognized client", - [DLM_ABORT] = "blocked lock request cancelled", - [DLM_CANCEL] = "conversion request cancelled", - [DLM_IVRESHANDLE] = "invalid resource handle", - [DLM_DEADLOCK] = "deadlock recovery refused this request", - [DLM_DENIED_NOASTS] = "failed to allocate AST", - [DLM_FORWARD] = "request must 
wait for primary's response", - [DLM_TIMEOUT] = "timeout value for lock has expired", - [DLM_IVGROUPID] = "invalid group specification", - [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", - [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", - [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", - [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", - [DLM_RECOVERING] = "lock resource being recovered", - [DLM_MIGRATING] = "lock resource being migrated", - [DLM_MAXSTATS] = "invalid error number", -}; - -const char *dlm_errmsg(enum dlm_status err) -{ - if (err >= DLM_MAXSTATS || err < 0) - return dlm_errmsgs[DLM_MAXSTATS]; - return dlm_errmsgs[err]; -} -EXPORT_SYMBOL_GPL(dlm_errmsg); - const char *dlm_errname(enum dlm_status err) { if (err >= DLM_MAXSTATS || err < 0) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 5c04dde99981..2347a50f079b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1274,7 +1274,7 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, { struct dlm_query_nodeinfo *qn; struct dlm_ctxt *dlm = NULL; - int locked = 0, status = -EINVAL; + int status = -EINVAL; qn = (struct dlm_query_nodeinfo *) msg->buf; @@ -1290,12 +1290,11 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qn->qn_nodenum) { mlog(ML_ERROR, "Node %d queried nodes on domain %s but " "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, dlm->joining_node); - goto bail; + goto unlock; } /* Support for node query was added in 1.1 */ @@ -1305,14 +1304,14 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qn->qn_nodenum, qn->qn_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto unlock; } status = dlm_match_nodes(dlm, qn); +unlock: + spin_unlock(&dlm->spinlock); bail: 
- if (locked) - spin_unlock(&dlm->spinlock); spin_unlock(&dlm_domain_lock); return status; @@ -1528,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm, { int status, node, live; - status = 0; node = -1; while ((node = find_next_bit(node_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { @@ -1878,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index d610da8e2f24..4145e06d2c08 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -21,7 +21,7 @@ #include <linux/inet.h> #include <linux/spinlock.h> #include <linux/delay.h> - +#include <linux/string_choices.h> #include "../cluster/heartbeat.h" #include "../cluster/nodemanager.h" @@ -1477,7 +1477,6 @@ way_up_top: goto send_response; } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { spin_unlock(&res->spinlock); - // mlog(0, "node %u is the master\n", res->owner); response = DLM_MASTER_RESP_NO; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1493,7 +1492,6 @@ way_up_top: BUG(); } - // mlog(0, "lockres is in progress...\n"); spin_lock(&dlm->master_lock); found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { @@ -1503,8 +1501,6 @@ way_up_top: set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->type == DLM_MLE_BLOCK) { - // mlog(0, "this node is waiting for " - // "lockres to be mastered\n"); response = DLM_MASTER_RESP_NO; } else if (tmpmle->type == DLM_MLE_MIGRATION) { mlog(0, "node %u is master, but trying to migrate to " @@ -1531,8 +1527,6 @@ way_up_top: } else response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "this node is attempting to " - // "master lockres\n"); response = DLM_MASTER_RESP_MAYBE; } if 
(set_maybe) @@ -1559,7 +1553,6 @@ way_up_top: found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { /* this lockid has never been seen on this node yet */ - // mlog(0, "no mle found\n"); if (!mle) { spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -1573,8 +1566,6 @@ way_up_top: goto way_up_top; } - // mlog(0, "this is second time thru, already allocated, " - // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); __dlm_insert_mle(dlm, mle); @@ -1897,8 +1888,6 @@ ok: spin_unlock(&res->spinlock); } - // mlog(0, "woo! got an assert_master from node %u!\n", - // assert->node_idx); if (mle) { int extra_ref = 0; int nn = -1; @@ -2859,7 +2848,7 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, dlm_lockres_release_ast(dlm, res); mlog(0, "about to wait on migration_wq, dirty=%s\n", - res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + str_yes_no(res->state & DLM_LOCK_RES_DIRTY)); /* if the extra ref we just put was the final one, this * will pass thru immediately. otherwise, we need to wait * for the last ast to finish. */ @@ -2869,12 +2858,12 @@ again: msecs_to_jiffies(1000)); if (ret < 0) { mlog(0, "woken again: migrating? %s, dead? %s\n", - res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", - test_bit(target, dlm->domain_map) ? "no":"yes"); + str_yes_no(res->state & DLM_LOCK_RES_MIGRATING), + str_no_yes(test_bit(target, dlm->domain_map))); } else { mlog(0, "all is well: migrating? %s, dead? %s\n", - res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", - test_bit(target, dlm->domain_map) ? 
"no":"yes"); + str_yes_no(res->state & DLM_LOCK_RES_MIGRATING), + str_no_yes(test_bit(target, dlm->domain_map))); } if (!dlm_migration_can_proceed(dlm, res, target)) { mlog(0, "trying again...\n"); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 50da8af988c1..843ee02bd85f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -22,7 +22,7 @@ #include <linux/timer.h> #include <linux/kthread.h> #include <linux/delay.h> - +#include <linux/string_choices.h> #include "../cluster/heartbeat.h" #include "../cluster/nodemanager.h" @@ -207,7 +207,7 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * 1) all recovery threads cluster wide will work on recovering * ONE node at a time * 2) negotiate who will take over all the locks for the dead node. - * thats right... ALL the locks. + * that's right... ALL the locks. * 3) once a new master is chosen, everyone scans all locks * and moves aside those mastered by the dead guy * 4) each of these locks should be locked until recovery is done @@ -464,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { - // mlog(0, "nothing to recover! sleeping now!\n"); spin_unlock(&dlm->spinlock); /* return to main thread loop and sleep. */ return 0; @@ -581,8 +580,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) msecs_to_jiffies(1000)); mlog(0, "waited 1 sec for %u, " "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? 
- "yes" : "no"); + str_yes_no(dlm_is_node_dead(dlm, ndata->node_num))); } else { /* -ENOMEM on the other node */ mlog(0, "%s: node %u returned " @@ -677,7 +675,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) spin_unlock(&dlm_reco_state_lock); mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, - all_nodes_done?"yes":"no"); + str_yes_no(all_nodes_done)); if (all_nodes_done) { int ret; @@ -1469,7 +1467,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * The first one is handled at the end of this function. The * other two are handled in the worker thread after locks have * been attached. Yes, we don't wait for purge time to match - * kref_init. The lockres will still have atleast one ref + * kref_init. The lockres will still have at least one ref * added because it is in the hash __dlm_insert_lockres() */ extra_refs++; @@ -1735,7 +1733,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&res->spinlock); } } else { - /* put.. incase we are not the master */ + /* put.. 
in case we are not the master */ spin_unlock(&res->spinlock); dlm_lockres_put(res); } @@ -2633,7 +2631,7 @@ again: dlm_reco_master_ready(dlm), msecs_to_jiffies(1000)); if (!dlm_reco_master_ready(dlm)) { - mlog(0, "%s: reco master taking awhile\n", + mlog(0, "%s: reco master taking a while\n", dlm->name); goto again; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8b2020f92b5f..339f0b11cdc8 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/pagemap.h> #include <linux/types.h> #include <linux/slab.h> @@ -80,8 +81,7 @@ static int param_set_dlmfs_capabilities(const char *val, static int param_get_dlmfs_capabilities(char *buffer, const struct kernel_param *kp) { - return strlcpy(buffer, DLMFS_CAPABILITIES, - strlen(DLMFS_CAPABILITIES) + 1); + return sysfs_emit(buffer, DLMFS_CAPABILITIES); } module_param_call(capabilities, param_set_dlmfs_capabilities, param_get_dlmfs_capabilities, NULL, 0444); @@ -188,18 +188,18 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. 
*/ -static int dlmfs_file_setattr(struct user_namespace *mnt_userns, +static int dlmfs_file_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -336,8 +336,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, NULL, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); + simple_inode_init_ts(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -359,8 +359,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, parent, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); + simple_inode_init_ts(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; @@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. 
*/ /* SMP-safe */ -static int dlmfs_mkdir(struct user_namespace * mnt_userns, - struct inode * dir, - struct dentry * dentry, - umode_t mode) +static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap, + struct inode * dir, + struct dentry * dentry, + umode_t mode) { int status; struct inode *inode = NULL; @@ -441,17 +441,16 @@ static int dlmfs_mkdir(struct user_namespace * mnt_userns, ip->ip_conn = conn; inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); status = 0; bail: if (status < 0) iput(inode); - return status; + return ERR_PTR(status); } -static int dlmfs_create(struct user_namespace *mnt_userns, +static int dlmfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -480,8 +479,7 @@ static int dlmfs_create(struct user_namespace *mnt_userns, goto bail; } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); bail: return status; } @@ -507,9 +505,7 @@ bail: return status; } -static int dlmfs_fill_super(struct super_block * sb, - void * data, - int silent) +static int dlmfs_fill_super(struct super_block *sb, struct fs_context *fc) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; @@ -549,7 +545,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { @@ -557,17 +553,27 @@ static const struct inode_operations dlmfs_file_inode_operations = { .setattr = dlmfs_file_setattr, }; -static struct dentry *dlmfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int dlmfs_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, dlmfs_fill_super); +} + +static const 
struct fs_context_operations dlmfs_context_ops = { + .get_tree = dlmfs_get_tree, +}; + +static int dlmfs_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, dlmfs_fill_super); + fc->ops = &dlmfs_context_ops; + + return 0; } static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .mount = dlmfs_mount, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, + .init_fs_context = dlmfs_init_fs_context, }; MODULE_ALIAS_FS("ocfs2_dlmfs"); @@ -579,7 +585,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; @@ -587,7 +593,8 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); + user_dlm_worker = alloc_workqueue("user_dlm", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c28bc983a7b1..619ff03b15d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -19,6 +19,7 @@ #include <linux/delay.h> #include <linux/quotaops.h> #include <linux/sched/signal.h> +#include <linux/string_choices.h> #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> @@ -221,12 +222,12 @@ struct ocfs2_lock_res_ops { */ #define LOCK_TYPE_USES_LVB 0x2 -static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { +static const struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { .get_osb = ocfs2_get_inode_osb, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { +static const struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { .get_osb = ocfs2_get_inode_osb, .check_downconvert = ocfs2_check_meta_downconvert, .set_lvb = ocfs2_set_meta_lvb, @@ -234,50 +235,50 @@ static struct 
ocfs2_lock_res_ops ocfs2_inode_inode_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_super_lops = { +static const struct ocfs2_lock_res_ops ocfs2_super_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH, }; -static struct ocfs2_lock_res_ops ocfs2_rename_lops = { +static const struct ocfs2_lock_res_ops ocfs2_rename_lops = { .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { +static const struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { +static const struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { +static const struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { +static const struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, .downconvert_worker = ocfs2_dentry_convert_worker, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { +static const struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { .get_osb = ocfs2_get_inode_osb, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_flock_lops = { +static const struct ocfs2_lock_res_ops ocfs2_flock_lops = { .get_osb = ocfs2_get_file_osb, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { +static const struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { .set_lvb = ocfs2_set_qinfo_lvb, .get_osb = ocfs2_get_qinfo_osb, .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { +static const struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { .check_downconvert = ocfs2_check_refcount_downconvert, .downconvert_worker = ocfs2_refcount_convert_worker, .flags = 0, @@ -510,7 
+511,7 @@ static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, - struct ocfs2_lock_res_ops *ops, + const struct ocfs2_lock_res_ops *ops, void *priv) { res->l_type = type; @@ -553,7 +554,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, unsigned int generation, struct inode *inode) { - struct ocfs2_lock_res_ops *ops; + const struct ocfs2_lock_res_ops *ops; switch(type) { case OCFS2_LOCK_TYPE_RW: @@ -794,7 +795,7 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) /* * Keep a list of processes who have interest in a lockres. - * Note: this is now only uesed for check recursive cluster locking. + * Note: this is now only used for check recursive cluster locking. */ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, struct ocfs2_lock_holder *oh) @@ -1615,7 +1616,7 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); - /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -2162,6 +2163,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2182,12 +2184,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); - lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); - lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); - lvb->lvb_imtime_packed = - 
cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + ts = inode_get_atime(inode); + lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_ctime(inode); + lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_mtime(inode); + lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); @@ -2208,6 +2210,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ts; mlog_meta_lvb(0, lockres); @@ -2234,12 +2237,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&inode->i_atime, - be64_to_cpu(lvb->lvb_iatime_packed)); - ocfs2_unpack_timespec(&inode->i_mtime, - be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, - be64_to_cpu(lvb->lvb_ictime_packed)); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed)); + inode_set_atime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed)); + inode_set_mtime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed)); + inode_set_ctime_to_ts(inode, ts); spin_unlock(&oi->ip_lock); return 0; } @@ -2484,7 +2487,7 @@ update: * which hasn't been populated yet, so clear the refresh flag * and let the caller handle it. 
*/ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { status = 0; if (lockres) ocfs2_complete_lock_res_refresh(lockres, 0); @@ -2527,30 +2530,28 @@ bail: /* * This is working around a lock inversion between tasks acquiring DLM - * locks while holding a page lock and the downconvert thread which - * blocks dlm lock acquiry while acquiring page locks. + * locks while holding a folio lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring folio locks. * - * ** These _with_page variantes are only intended to be called from aop - * methods that hold page locks and return a very specific *positive* error + * ** These _with_folio variants are only intended to be called from aop + * methods that hold folio locks and return a very specific *positive* error * code that aop methods pass up to the VFS -- test for errors with != 0. ** * * The DLM is called such that it returns -EAGAIN if it would have * blocked waiting for the downconvert thread. In that case we unlock - * our page so the downconvert thread can make progress. Once we've + * our folio so the downconvert thread can make progress. Once we've * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. */ -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page) +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio) { int ret; ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { - unlock_page(page); + folio_unlock(folio); /* * If we can't get inode lock immediately, we should not return * directly here, since this will lead to a softlockup problem. 
@@ -2628,7 +2629,7 @@ void ocfs2_inode_unlock(struct inode *inode, } /* - * This _tracker variantes are introduced to deal with the recursive cluster + * This _tracker variants are introduced to deal with the recursive cluster * locking issue. The idea is to keep track of a lock holder on the stack of * the current process. If there's a lock holder on the stack, we know the * task context is already protected by cluster locking. Currently, they're @@ -2733,7 +2734,7 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; - /* had_lock means that the currect process already takes the cluster + /* had_lock means that the current process already takes the cluster * lock previously. * If had_lock is 1, we have nothing to do here. * If had_lock is 0, we will release the lock. @@ -3108,6 +3109,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) struct ocfs2_lock_res *iter = v; struct ocfs2_lock_res *dummy = &priv->p_iter_res; + (*pos)++; spin_lock(&ocfs2_dlm_tracking_lock); iter = ocfs2_dlm_next_res(iter, priv); list_del_init(&dummy->l_debug_list); @@ -3149,11 +3151,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) #ifdef CONFIG_OCFS2_FS_STATS if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) { now = ktime_to_us(ktime_get_real()); - if (lockres->l_lock_prmode.ls_last > - lockres->l_lock_exmode.ls_last) - last = lockres->l_lock_prmode.ls_last; - else - last = lockres->l_lock_exmode.ls_last; + last = max(lockres->l_lock_prmode.ls_last, + lockres->l_lock_exmode.ls_last); /* * Use d_filter_secs field to filter lock resources dump, * the default d_filter_secs(0) value filters nothing, @@ -3802,9 +3801,9 @@ recheck: * set when the ast is received for an upconvert just before the * OCFS2_LOCK_BUSY flag is cleared. 
Now if the fs received a bast * on the heels of the ast, we want to delay the downconvert just - * enough to allow the up requestor to do its task. Because this + * enough to allow the up requester to do its task. Because this * lock is in the blocked queue, the lock will be downconverted - * as soon as the requestor is done with the lock. + * as soon as the requester is done with the lock. */ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) goto leave_requeue; @@ -4339,7 +4338,7 @@ unqueue: ocfs2_schedule_blocked_lock(osb, lockres); mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, - ctl.requeue ? "yes" : "no"); + str_yes_no(ctl.requeue)); spin_unlock_irqrestore(&lockres->l_lock, flags); if (ctl.unblock_action != UNBLOCK_CONTINUE diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e5da5809ed95..a3ebd7303ea2 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -137,10 +137,8 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, int ex, int arg_flags, int subclass); -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page); +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio); /* Variants without special locking class or flags */ #define ocfs2_inode_lock_full(i, r, e, f)\ ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index eaa8c80ace3c..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -255,9 +255,9 @@ static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, if (fh_len < 3 || fh_type > 2) return NULL; - handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; - handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); - handle.ih_generation = le32_to_cpu(fid->raw[2]); + handle.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[1]); + handle.ih_generation = 
le32_to_cpu((__force __le32)fid->raw[2]); return ocfs2_get_dentry(sb, &handle); } @@ -269,9 +269,9 @@ static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, if (fh_type != 2 || fh_len < 6) return NULL; - parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; - parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); - parent.ih_generation = le32_to_cpu(fid->raw[5]); + parent.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[4]); + parent.ih_generation = le32_to_cpu((__force __le32)fid->raw[5]); return ocfs2_get_dentry(sb, &parent); } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 70a768b623cf..ef147e8b3271 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -435,6 +435,16 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, } } + if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n", + inode->i_ino, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + i = ocfs2_search_extent_list(el, v_cluster); if (i == -1) { /* @@ -696,6 +706,8 @@ out: * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. 
*/ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -707,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -722,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -792,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; @@ -973,7 +991,13 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, } while (done < nr) { - down_read(&OCFS2_I(inode)->ip_alloc_sem); + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { + rc = -EAGAIN; + mlog(ML_ERROR, + "Inode #%llu ip_alloc_sem is temporarily unavailable\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + break; + } rc = ocfs2_extent_map_get_blocks(inode, v_block + done, &p_block, &p_count, NULL); up_read(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 
5c60b6bc85bf..21d797ccccd0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -232,15 +232,19 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { - if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 atime = inode_get_atime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + + if ((timespec64_compare(&atime, &mtime) <= 0) || + (timespec64_compare(&atime, &ctime) <= 0)) return 1; return 0; } now = current_time(inode); - if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum)) return 0; else return 1; @@ -273,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode, * have i_rwsem to guard against concurrent changes to other * inode fields. */ - inode->i_atime = current_time(inode); - di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + inode_set_atime_to_ts(inode, current_time(inode)); + di->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -294,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -415,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = 
cpu_to_le64(new_i_size); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -751,7 +755,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; @@ -770,18 +774,19 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out; } - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); goto out_commit_trans; } - /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_SIZE - 1); - zero_to = abs_to & (PAGE_SIZE - 1); + /* Get the offsets within the folio that we want to zero */ + zero_from = offset_in_folio(folio, abs_from); + zero_to = offset_in_folio(folio, abs_to); if (!zero_to) - zero_to = PAGE_SIZE; + zero_to = folio_size(folio); trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -799,7 +804,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = __block_write_begin(page, block_start + 1, 0, + ret = __block_write_begin(folio, block_start + 1, 0, ocfs2_get_block); if (ret < 0) { mlog_errno(ret); @@ -808,25 +813,20 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! 
*/ - ret = block_commit_write(page, block_start + 1, - block_start + 1); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + block_commit_write(folio, block_start + 1, block_start + 1); } /* * fs-writeback will release the dirty pages without page lock * whose offset are over inode size, the release happens at - * block_write_full_page(). + * block_write_full_folio(). */ i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); di->i_mtime_nsec = di->i_ctime_nsec; if (handle) { ocfs2_journal_dirty(handle, di_bh); @@ -834,8 +834,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -1111,7 +1111,7 @@ out: return ret; } -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; @@ -1129,9 +1129,12 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, dentry->d_name.len, dentry->d_name.name, - attr->ia_valid, attr->ia_mode, - from_kuid(&init_user_ns, attr->ia_uid), - from_kgid(&init_user_ns, attr->ia_gid)); + attr->ia_valid, + attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0, + attr->ia_valid & ATTR_UID ? + from_kuid(&init_user_ns, attr->ia_uid) : 0, + attr->ia_valid & ATTR_GID ? 
+ from_kgid(&init_user_ns, attr->ia_gid) : 0); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) @@ -1142,11 +1145,11 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = setattr_prepare(&init_user_ns, dentry, attr); + status = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (status) return status; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { status = dquot_initialize(inode); if (status) return status; @@ -1265,7 +1268,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1302,7 +1305,7 @@ bail: return status; } -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -1317,7 +1320,7 @@ int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, goto bail; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). 
@@ -1334,7 +1337,7 @@ bail: return err; } -int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret, had_lock; @@ -1360,7 +1363,7 @@ int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, dump_stack(); } - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: @@ -1784,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode, return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); + + if (byte_start > id_count || byte_start + byte_len > id_count) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, byte_start + byte_len, 0); if (ret) { @@ -1937,6 +1948,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, inode_lock(inode); + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); /* * This prevents concurrent writes on other nodes */ @@ -1991,7 +2004,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2043,7 +2056,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); @@ -2100,14 +2113,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, struct ocfs2_space_resv sr; int 
change_size = 1; int cmd = OCFS2_IOC_RESVSP64; + int ret = 0; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_KEEP_SIZE) + if (mode & FALLOC_FL_KEEP_SIZE) { change_size = 0; + } else { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + return ret; + } if (mode & FALLOC_FL_PUNCH_HOLE) cmd = OCFS2_IOC_UNRESVSP64; @@ -2279,7 +2298,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (setattr_should_drop_suidgid(&init_user_ns, inode)) { + if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, @@ -2379,6 +2398,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } else inode_lock(inode); + ocfs2_iocb_init_rw_locked(iocb); + /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". @@ -2525,6 +2546,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, if (!direct_io && nowait) return -EOPNOTSUPP; + ocfs2_iocb_init_rw_locked(iocb); + /* * buffered reads protect themselves in ->read_folio(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. @@ -2552,7 +2575,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_read_iter() a chance of actually working. + * copy_splice_read() a chance of actually working. 
*/ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, !nowait); @@ -2581,6 +2604,43 @@ bail: return ret; } +static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t ret = 0; + int lock_level = 0; + + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, + flags); + + /* + * We're fine letting folks race truncates and extending writes with + * read across the cluster, just like they can locally. Hence no + * rw_lock during read. + * + * Take and drop the meta data lock to update inode fields like i_size. + * This allows the checks down below filemap_splice_read() a chance of + * actually working. + */ + ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level, 1); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = filemap_splice_read(in, ppos, pipe, len, flags); + trace_filemap_splice_read_ret(ret); +bail: + return ret; +} + /* Refer generic_file_llseek_unlocked() */ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) { @@ -2706,6 +2766,13 @@ out_unlock: return remapped > 0 ? 
remapped : ret; } +static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ocfs2_file_private *fp = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &fp->cookie); +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2721,6 +2788,7 @@ const struct inode_operations ocfs2_file_iops = { const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, + .listxattr = ocfs2_listxattr, .permission = ocfs2_permission, .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, @@ -2732,7 +2800,7 @@ const struct inode_operations ocfs2_special_file_iops = { */ const struct file_operations ocfs2_fops = { .llseek = ocfs2_file_llseek, - .mmap = ocfs2_mmap, + .mmap_prepare = ocfs2_mmap_prepare, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, @@ -2744,16 +2812,18 @@ const struct file_operations ocfs2_fops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = ocfs2_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; +WRAP_DIR_ITER(ocfs2_readdir) // FIXME! 
const struct file_operations ocfs2_dops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2763,6 +2833,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* @@ -2779,7 +2850,7 @@ const struct file_operations ocfs2_dops = { */ const struct file_operations ocfs2_fops_no_plocks = { .llseek = ocfs2_file_llseek, - .mmap = ocfs2_mmap, + .mmap_prepare = ocfs2_mmap_prepare, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, @@ -2790,16 +2861,16 @@ const struct file_operations ocfs2_fops_no_plocks = { .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 71db8f3aa027..41e65e45a9f3 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -20,6 +20,7 @@ struct ocfs2_alloc_context; enum ocfs2_alloc_restarted; struct ocfs2_file_private { + u64 cookie; struct file *fp_file; struct mutex fp_mutex; struct ocfs2_lock_res fp_flock; @@ -49,11 +50,11 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int 
ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int ocfs2_permission(struct user_namespace *mnt_userns, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 1ad7106741f8..3ad7baf67658 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - return (!ret ? count : ret); + return ret ?: count; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bb116c39b581..8340525e5589 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -50,8 +50,6 @@ struct ocfs2_find_inode_args unsigned int fi_sysfile_type; }; -static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; - static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); @@ -154,8 +152,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, mlog_errno(PTR_ERR(inode)); goto bail; } - trace_ocfs2_iget5_locked(inode->i_state); - if (inode->i_state & I_NEW) { + trace_ocfs2_iget5_locked(inode_state_read_once(inode)); + if (inode_state_read_once(inode) & I_NEW) { rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } @@ -200,6 +198,22 @@ bail: return inode; } +static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) +{ + /* inodes flagged with other stuff in id2 */ + if (le32_to_cpu(di->i_flags) & + (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | OCFS2_CHAIN_FL | + OCFS2_DEALLOC_FL)) + return 0; + /* i_flags doesn't indicate when id2 is a fast symlink */ + if (S_ISLNK(le16_to_cpu(di->i_mode)) && 
le64_to_cpu(di->i_size) && + !le32_to_cpu(di->i_clusters)) + return 0; + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) + return 0; + + return 1; +} /* * here's how inodes get read from disk: @@ -236,14 +250,77 @@ bail: static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; +#endif inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; - if (args->fi_sysfile_type != 0) +#ifdef CONFIG_LOCKDEP + switch (args->fi_sysfile_type) { + case BAD_BLOCK_SYSTEM_INODE: + break; + case GLOBAL_INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]); + break; + case SLOT_MAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]); + break; + case HEARTBEAT_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]); + break; + case GLOBAL_BITMAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]); + break; + case USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]); + break; + case GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]); + break; + case ORPHAN_DIR_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]); + break; + case EXTENT_ALLOC_SYSTEM_INODE: lockdep_set_class(&inode->i_rwsem, - &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]); + break; + case INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]); + break; 
+ case JOURNAL_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]); + break; + case LOCAL_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]); + break; + case TRUNCATE_LOG_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]); + break; + case LOCAL_USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]); + break; + case LOCAL_GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]); + break; + default: + WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type); + } if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || @@ -253,6 +330,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); +#endif return 0; } @@ -302,12 +380,12 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; } - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, @@ -1122,7 +1200,7 @@ static 
void ocfs2_clear_inode(struct inode *inode) dquot_drop(inode); - /* To preven remote deletes we hold open lock before, now it + /* To prevent remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); @@ -1205,12 +1283,17 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ + if (!osb->journal) + return; + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } void ocfs2_evict_inode(struct inode *inode) { + write_inode_now(inode, 1); + if (!inode->i_nlink || (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); @@ -1220,27 +1303,6 @@ void ocfs2_evict_inode(struct inode *inode) ocfs2_clear_inode(inode); } -/* Called under inode_lock, with no more references on the - * struct inode, so it's safe here to check the flags field - * and to manipulate i_nlink without any other locks. */ -int ocfs2_drop_inode(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, - inode->i_nlink, oi->ip_flags); - - assert_spin_locked(&inode->i_lock); - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode->i_lock); - write_inode_now(inode, 1); - spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_NEW); - inode->i_state &= ~I_WILL_FREE; - - return 1; -} - /* * This is called from our getattr. 
*/ @@ -1312,12 +1374,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_uid = cpu_to_le32(i_uid_read(inode)); fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); - fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ocfs2_journal_dirty(handle, bh); ocfs2_update_inode_fsync_trans(handle, inode, 1); @@ -1348,12 +1410,12 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_blocks = 0; else inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); spin_unlock(&OCFS2_I(inode)->ip_lock); } @@ -1400,7 +1462,7 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } - if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + if (!(le32_to_cpu(di->i_flags) & OCFS2_VALID_FL)) { rc = ocfs2_error(sb, "Invalid dinode #%llu: 
OCFS2_VALID_FL not set\n", (unsigned long long)bh->b_blocknr); @@ -1416,6 +1478,49 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } + if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) { + rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->i_suballoc_slot)); + goto bail; + } + + if ((le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) && + le32_to_cpu(di->i_clusters)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: %u clusters\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_clusters)); + goto bail; + } + + if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) { + struct ocfs2_chain_list *cl = &di->id2.i_chain; + u16 bpc = 1 << (OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits); + + if (le16_to_cpu(cl->cl_count) != ocfs2_chain_recs_per_inode(sb)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: chain list count %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_count)); + goto bail; + } + if (le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: chain list index %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_next_free_rec)); + goto bail; + } + if (OCFS2_SB(sb)->bitmap_blkno && + OCFS2_SB(sb)->bitmap_blkno != le64_to_cpu(di->i_blkno) && + le16_to_cpu(cl->cl_bpc) != bpc) { + rc = ocfs2_error(sb, "Invalid dinode %llu: bits per cluster %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_bpc)); + goto bail; + } + } + rc = 0; bail: @@ -1437,7 +1542,7 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, * Call ocfs2_validate_meta_ecc() first since it has ecc repair * function, but we should not return error immediately when ecc * validation fails, because the reason is quite likely the invalid - * inode number inputed. + * inode number inputted. 
*/ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); if (rc) { @@ -1547,6 +1652,16 @@ static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, le32_to_cpu(di->i_fs_generation)); } + if (ocfs2_dinode_has_extents(di) && + le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) { + di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count; + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->id2.i_list.l_next_free_rec)); + } + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); mark_buffer_dirty(bh); @@ -1593,6 +1708,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1, &tmp, flags, ocfs2_validate_inode_block); + if (rc < 0) + make_bad_inode(inode); /* If ocfs2_read_blocks() got us a new bh, pass it up. 
*/ if (!rc && !*bh) *bh = tmp; @@ -1621,6 +1738,7 @@ static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info } static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); @@ -1628,6 +1746,7 @@ static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 82b28fdacc7e..07bd838e7843 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -65,7 +65,7 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; - struct dquot *i_dquot[MAXQUOTAS]; + struct dquot __rcu *i_dquot[MAXQUOTAS]; }; /* @@ -116,7 +116,6 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) } void ocfs2_evict_inode(struct inode *inode); -int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index afd54ec66103..b6864602814c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -62,7 +62,7 @@ static inline int o2info_coherent(struct ocfs2_info_request *req) return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } -int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags; @@ -82,8 +82,8 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return status; } -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa) +int ocfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags = fa->flags; @@ -125,6 +125,7 @@ int ocfs2_fileattr_set(struct user_namespace 
*mnt_userns, ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); + inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) @@ -357,13 +358,11 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); + int len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -650,12 +649,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); + int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), + type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -795,7 +792,7 @@ bail: /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * - * ocfs2_info_handle() recevies a large info aggregation, grab and + * ocfs2_info_handle() receives a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * @@ -803,8 +800,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. 
*/ -static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static noinline_for_stack int +ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; @@ -840,27 +837,26 @@ bail: long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - int new_clusters; - int status; - struct ocfs2_space_resv sr; - struct ocfs2_new_group_input input; - struct reflink_arguments args; - const char __user *old_path; - const char __user *new_path; - bool preserve; - struct ocfs2_info info; void __user *argp = (void __user *)arg; + int status; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: + { + struct ocfs2_space_resv sr; + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); + } case OCFS2_IOC_GROUP_EXTEND: + { + int new_clusters; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -873,8 +869,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + { + struct ocfs2_new_group_input input; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -887,7 +887,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_REFLINK: + { + struct reflink_arguments args; + const char __user *old_path; + const char __user *new_path; + bool preserve; + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; @@ -895,11 +902,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return 
ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + } case OCFS2_IOC_INFO: + { + struct ocfs2_info info; + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); + } case FITRIM: { struct super_block *sb = inode->i_sb; diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 0297c8846945..4a1c2313b429 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -11,9 +11,9 @@ #ifndef OCFS2_IOCTL_PROTO_H #define OCFS2_IOCTL_PROTO_H -int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa); +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); +int ocfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3fb98b4569a2..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -15,6 +15,7 @@ #include <linux/time.h> #include <linux/random.h> #include <linux/delay.h> +#include <linux/writeback.h> #include <cluster/masklog.h> @@ -89,7 +90,7 @@ enum ocfs2_replay_state { struct ocfs2_replay_map { unsigned int rm_slots; enum ocfs2_replay_state rm_state; - unsigned char rm_replay_slots[]; + unsigned char rm_replay_slots[] __counted_by(rm_slots); }; static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) @@ -113,9 +114,9 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) if (osb->replay_map) return 0; - replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + - (osb->max_slots * sizeof(char)), GFP_KERNEL); - + replay_map = kzalloc(struct_size(replay_map, rm_replay_slots, + osb->max_slots), + GFP_KERNEL); if (!replay_map) { mlog_errno(-ENOMEM); return -ENOMEM; @@ -173,50 +174,69 @@ int 
ocfs2_recovery_init(struct ocfs2_super *osb) struct ocfs2_recovery_map *rm; mutex_init(&osb->recovery_lock); - osb->disable_recovery = 0; + osb->recovery_state = OCFS2_REC_ENABLED; osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); - rm = kzalloc(sizeof(struct ocfs2_recovery_map) + - osb->max_slots * sizeof(unsigned int), + rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots), GFP_KERNEL); if (!rm) { mlog_errno(-ENOMEM); return -ENOMEM; } - rm->rm_entries = (unsigned int *)((char *)rm + - sizeof(struct ocfs2_recovery_map)); osb->recovery_map = rm; return 0; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) { - mb(); return osb->recovery_thread_task != NULL; } -void ocfs2_recovery_exit(struct ocfs2_super *osb) +static void ocfs2_recovery_disable(struct ocfs2_super *osb, + enum ocfs2_recovery_state state) { - struct ocfs2_recovery_map *rm; - - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; + /* + * If recovery thread is not running, we can directly transition to + * final state. + */ + if (!ocfs2_recovery_thread_running(osb)) { + osb->recovery_state = state + 1; + goto out_lock; + } + osb->recovery_state = state; + /* Wait for recovery thread to acknowledge state transition */ + wait_event_cmd(osb->recovery_event, + !ocfs2_recovery_thread_running(osb) || + osb->recovery_state >= state + 1, + mutex_unlock(&osb->recovery_lock), + mutex_lock(&osb->recovery_lock)); +out_lock: mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. 
*/ + /* + * At this point we know that no more recovery work can be queued so + * wait for any recovery completion work to complete. + */ if (osb->ocfs2_wq) flush_workqueue(osb->ocfs2_wq); +} + +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ + ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE); /* * Now that recovery is shut down, and the osb is about to be @@ -448,6 +468,23 @@ bail: } /* + * Make sure handle has at least 'nblocks' credits available. If it does not + * have that many credits available, we will try to extend the handle to have + * enough credits. If that fails, we will restart transaction to have enough + * credits. Similar notes regarding data consistency and locking implications + * as for ocfs2_extend_trans() apply here. + */ +int ocfs2_assure_trans_credits(handle_t *handle, int nblocks) +{ + int old_nblks = jbd2_handle_buffer_credits(handle); + + trace_ocfs2_assure_trans_credits(old_nblks); + if (old_nblks >= nblocks) + return 0; + return ocfs2_extend_trans(handle, nblocks - old_nblks); +} + +/* * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. * If that fails, restart the transaction & regain write access for the * buffer head which is used for metadata modifications. 
@@ -481,12 +518,6 @@ bail: return status; } - -struct ocfs2_triggers { - struct jbd2_buffer_trigger_type ot_triggers; - int ot_offset; -}; - static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) { return container_of(triggers, struct ocfs2_triggers, ot_triggers); @@ -550,85 +581,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh) { + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + mlog(ML_ERROR, "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " "bh->b_blocknr = %llu\n", (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_bdev->bd_super, + ocfs2_error(ot->sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } -static struct ocfs2_triggers di_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dinode, i_check), -}; - -static struct ocfs2_triggers eb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_extent_block, h_check), -}; - -static struct ocfs2_triggers rb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), -}; - -static struct ocfs2_triggers gd_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), -}; - -static struct ocfs2_triggers db_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_db_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; +static void ocfs2_setup_csum_triggers(struct super_block *sb, + enum ocfs2_journal_trigger_type type, + struct ocfs2_triggers *ot) +{ + BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT); -static 
struct ocfs2_triggers xb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), -}; + switch (type) { + case OCFS2_JTR_DI: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dinode, i_check); + break; + case OCFS2_JTR_EB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check); + break; + case OCFS2_JTR_RB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check); + break; + case OCFS2_JTR_GD: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check); + break; + case OCFS2_JTR_DB: + ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger; + break; + case OCFS2_JTR_XB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check); + break; + case OCFS2_JTR_DQ: + ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger; + break; + case OCFS2_JTR_DR: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check); + break; + case OCFS2_JTR_DL: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check); + break; + case OCFS2_JTR_NONE: + /* To make compiler happy... 
*/ + return; + } -static struct ocfs2_triggers dq_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_dq_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; + ot->ot_triggers.t_abort = ocfs2_abort_trigger; + ot->sb = sb; +} -static struct ocfs2_triggers dr_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), -}; +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]) +{ + enum ocfs2_journal_trigger_type type; -static struct ocfs2_triggers dl_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), -}; + for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++) + ocfs2_setup_csum_triggers(sb, type, &triggers[type]); +} static int __ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -710,56 +732,91 @@ static int __ocfs2_journal_access(handle_t *handle, int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DI], + type); } int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_EB], + type); } int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + struct ocfs2_super *osb = 
OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_RB], type); } int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_GD], + type); } int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DB], + type); } int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_XB], + type); } int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DQ], + type); } int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DR], + type); } int ocfs2_journal_access_dl(handle_t *handle, struct 
ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DL], + type); } int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -779,14 +836,16 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) mlog_errno(status); if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - struct super_block *sb = bh->b_bdev->bd_super; - mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " - "Aborting transaction and journal.\n"); + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: " + "handle type %u started at line %u, credits %u/%u " + "errcode %d. Aborting transaction and journal.\n", + handle->h_type, handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), status); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(sb, "Journal already aborted.\n"); } } } @@ -841,6 +900,12 @@ bail: return status; } +static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); +} + int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; @@ -897,9 +962,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } @@ -910,7 +975,7 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) journal->j_journal = j_journal; journal->j_journal->j_submit_inode_data_buffers = - 
jbd2_journal_submit_inode_data_buffers; + ocfs2_journal_submit_inode_data_buffers; journal->j_journal->j_finish_inode_data_buffers = jbd2_journal_finish_inode_data_buffers; journal->j_inode = inode; @@ -1005,7 +1070,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (!igrab(inode)) BUG(); - num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + num_running_trans = atomic_read(&(journal->j_num_trans)); trace_ocfs2_journal_shutdown(num_running_trans); /* Do a commit_cache here. It will flush our journal, *and* @@ -1024,9 +1089,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) osb->commit_task = NULL; } - BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + BUG_ON(atomic_read(&(journal->j_num_trans)) != 0); - if (ocfs2_mount_local(osb)) { + if (ocfs2_mount_local(osb) && + (journal->j_journal->j_flags & JBD2_LOADED)) { jbd2_journal_lock_updates(journal->j_journal); status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); @@ -1198,7 +1264,7 @@ static int ocfs2_force_read_journal(struct inode *inode) } for (i = 0; i < p_blocks; i++, p_blkno++) { - bh = __find_get_block(osb->sb->s_bdev, p_blkno, + bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno, osb->sb->s_blocksize); /* block not cached. 
*/ if (!bh) @@ -1421,6 +1487,18 @@ static int __ocfs2_recovery_thread(void *arg) } } restart: + if (quota_enabled) { + mutex_lock(&osb->recovery_lock); + /* Confirm that recovery thread will no longer recover quotas */ + if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { + osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; + wake_up(&osb->recovery_event); + } + if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) + quota_enabled = 0; + mutex_unlock(&osb->recovery_lock); + } + status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1518,27 +1596,29 @@ bail: ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; - mb(); /* sync with ocfs2_recovery_thread_running */ + if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) + osb->recovery_state = OCFS2_REC_DISABLED; wake_up(&osb->recovery_event); mutex_unlock(&osb->recovery_lock); - if (quota_enabled) - kfree(rm_quota); + kfree(rm_quota); return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { + int was_set = -1; + mutex_lock(&osb->recovery_lock); + if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) + was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, - osb->disable_recovery, osb->recovery_thread_task, - osb->disable_recovery ? - -1 : ocfs2_recovery_map_set(osb, node_num)); + osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->disable_recovery) + if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE) goto out; if (osb->recovery_thread_task) @@ -1673,9 +1753,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } @@ -1905,7 +1985,7 @@ bail: /* * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. 
Add some - * randomness to the timeout to minimize multple nodes firing the timer at the + * randomness to the timeout to minimize multiple nodes firing the timer at the * same time. */ static inline unsigned long ocfs2_orphan_scan_timeout(void) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 41c382f68529..6397170f302f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,7 +29,7 @@ struct ocfs2_dinode; struct ocfs2_recovery_map { unsigned int rm_used; - unsigned int *rm_entries; + unsigned int rm_entries[]; }; @@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); void ocfs2_free_replay_slots(struct ocfs2_super *osb); @@ -243,6 +244,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_assure_trans_credits(handle_t *handle, + int nblocks); int ocfs2_allocate_extend_trans(handle_t *handle, int thresh); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c4426d12a2ad..d1aa04a5af1b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -212,14 +212,15 @@ static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters) { - spin_lock(&osb->osb_lock); - if (osb->local_alloc_state == OCFS2_LA_DISABLED || - osb->local_alloc_state == OCFS2_LA_THROTTLED) - if (num_clusters >= osb->local_alloc_default_bits) { + if (num_clusters >= osb->local_alloc_default_bits) { + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) { cancel_delayed_work(&osb->la_enable_wq); osb->local_alloc_state = 
OCFS2_LA_ENABLED; } - spin_unlock(&osb->osb_lock); + spin_unlock(&osb->osb_lock); + } } void ocfs2_la_enable_worker(struct work_struct *work) @@ -335,7 +336,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off)); status = -EINVAL; goto bail; @@ -863,14 +864,8 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, numfound = bitoff = startoff = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { - if (bitoff == left) { - /* mlog(0, "bitoff (%d) == left", bitoff); */ - break; - } - /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " - "numfound = %d\n", bitoff, startoff, numfound);*/ - + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) < + left) { /* Ok, we found a zero bit... is it contig. 
or do we * start over?*/ if (bitoff == startoff) { @@ -973,11 +968,11 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, la_start_blk = ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(la->la_bm_off)); bitmap = la->la_bitmap; - start = count = bit_off = 0; + start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) - != -1) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); if ((bit_off < left) && (bit_off == start)) { count++; start++; @@ -1002,6 +997,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, goto bail; } } + if (bit_off >= left) break; count = 1; @@ -1220,7 +1216,7 @@ retry_enospc: OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); trace_ocfs2_local_alloc_new_window_result( - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off), le32_to_cpu(alloc->id1.bitmap1.i_total)); bail: diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 73a3854b2afb..6de944818c56 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> @@ -26,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; - if (fl->fl_type == F_WRLCK) + if (lock_is_write(fl)) level = 1; if (!IS_SETLKW(cmd)) trylock = 1; @@ -52,8 +53,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, */ locks_init_lock(&request); - request.fl_type = F_UNLCK; - request.fl_flags = FL_FLOCK; + request.c.flc_type = F_UNLCK; + request.c.flc_flags = FL_FLOCK; locks_lock_file_wait(file, &request); ocfs2_file_unlock(file); @@ -99,14 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & 
FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) return locks_lock_file_wait(file, fl); - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return ocfs2_do_funlock(file, cmd, fl); else return ocfs2_do_flock(file, inode, cmd, fl); @@ -117,7 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 1834f26522ed..50e2faf64c19 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,16 +44,16 @@ static vm_fault_t ocfs2_fault(struct vm_fault *vmf) } static vm_fault_t __ocfs2_page_mkwrite(struct file *file, - struct buffer_head *di_bh, struct page *page) + struct buffer_head *di_bh, struct folio *folio) { int err; vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - loff_t pos = page_offset(page); + loff_t pos = folio_pos(folio); unsigned int len = PAGE_SIZE; pgoff_t last_index; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; void *fsdata; loff_t size = i_size_read(inode); @@ -72,9 +72,9 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, * * Let VM retry with these cases. */ - if ((page->mapping != inode->i_mapping) || - (!PageUptodate(page)) || - (page_offset(page) >= size)) + if ((folio->mapping != inode->i_mapping) || + !folio_test_uptodate(folio) || + (pos >= size)) goto out; /* @@ -87,11 +87,11 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, * worry about ocfs2_write_begin() skipping some buffer reads * because the "write" would invalidate their data. 
*/ - if (page->index == last_index) + if (folio->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_page, &fsdata, di_bh, page); + &locked_folio, &fsdata, di_bh, folio); if (err) { if (err != -ENOSPC) mlog_errno(err); @@ -99,7 +99,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, goto out; } - if (!locked_page) { + if (!locked_folio) { ret = VM_FAULT_NOPAGE; goto out; } @@ -112,7 +112,7 @@ out: static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; @@ -141,7 +141,7 @@ static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, folio); up_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -159,8 +159,9 @@ static const struct vm_operations_struct ocfs2_file_vm_ops = { .page_mkwrite = ocfs2_page_mkwrite, }; -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +int ocfs2_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), @@ -171,7 +172,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) } ocfs2_inode_unlock(file_inode(file), lock_level); out: - vma->vm_ops = &ocfs2_file_vm_ops; + desc->vm_ops = &ocfs2_file_vm_ops; return 0; } diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h index 1051507cc684..d21c30de6b8c 100644 --- a/fs/ocfs2/mmap.h +++ b/fs/ocfs2/mmap.h @@ -2,6 +2,6 @@ #ifndef OCFS2_MMAP_H #define OCFS2_MMAP_H -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); +int ocfs2_mmap_prepare(struct vm_area_desc *desc); #endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c 
index 192cad0662d8..ce978a2497d9 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,21 +98,19 @@ static int __ocfs2_move_extent(handle_t *handle, rec = &el->l_recs[index]; - BUG_ON(ext_flags != rec->e_flags); + if (ext_flags != rec->e_flags) { + ret = ocfs2_error(inode->i_sb, + "Inode %llu has corrupted extent %d with flags 0x%x at cpos %u\n", + (unsigned long long)ino, index, rec->e_flags, cpos); + goto out; + } + /* * after moving/defraging to new location, the extent is not going * to be refcounted anymore. */ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), - context->et.et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_split_extent(handle, &context->et, path, index, &replace_rec, context->meta_ac, &context->dealloc); @@ -121,8 +119,6 @@ static int __ocfs2_move_extent(handle_t *handle, goto out; } - ocfs2_journal_dirty(handle, context->et.et_root_bh); - context->new_phys_cpos = new_p_cpos; /* @@ -374,7 +370,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, int *vict_bit, struct buffer_head **ret_bh) { - int ret, i, bits_per_unit = 0; + int ret, i, len, bits_per_unit = 0; u64 blkno; char namebuf[40]; @@ -385,9 +381,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, struct ocfs2_dinode *ac_dinode; struct ocfs2_group_desc *bg; - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); + if (ret) { ret = -ENOENT; goto out; @@ -444,7 +440,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + - le16_to_cpu(bg->bg_bits))) { + 
(le16_to_cpu(bg->bg_bits) << bits_per_unit))) { *ret_bh = gd_bh; *vict_bit = (vict_blkno - blkno) >> @@ -502,7 +498,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* - * moving goal is not allowd to start with a group desc blok(#0 blk) + * moving goal is not allowed to start with a group desc blok(#0 blk) * let's compromise to the latter cluster. */ if (range->me_goal == le64_to_cpu(bg->bg_blkno)) @@ -559,6 +555,7 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, last_free_bits++; if (last_free_bits == move_len) { + i -= move_len; *goal_bit = i; *phys_cpos = base_cpos + i; break; @@ -626,6 +623,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, */ credits += OCFS2_INODE_UPDATE_CREDITS + 1; + inode_lock(tl_inode); + /* * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() * logic, while we still need to lock the global_bitmap. @@ -635,7 +634,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); ret = -EIO; - goto out; + goto out_unlock_tl_inode; } inode_lock(gb_inode); @@ -643,16 +642,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { mlog_errno(ret); - goto out_unlock_gb_mutex; + goto out_unlock_gb_inode; } - inode_lock(tl_inode); - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock_tl_inode; + goto out_unlock; } new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); @@ -667,7 +664,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, /* * probe the victim cluster group to find a proper - * region to fit wanted movement, it even will perfrom + * region to fit wanted movement, it even will perform * a best-effort attempt by 
compromising to a threshold * around the goal. */ @@ -694,7 +691,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, - goal_bit, len); + goal_bit, len, 0, 0); if (ret) { ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, le16_to_cpu(gd->bg_chain)); @@ -712,15 +709,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, out_commit: ocfs2_commit_trans(osb, handle); brelse(gd_bh); - -out_unlock_tl_inode: - inode_unlock(tl_inode); - +out_unlock: ocfs2_inode_unlock(gb_inode, 1); -out_unlock_gb_mutex: +out_unlock_gb_inode: inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); +out_unlock_tl_inode: + inode_unlock(tl_inode); out: if (context->meta_ac) { @@ -877,6 +873,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, mlog_errno(ret); goto out; } + /* + * Invalidate extent cache after moving/defragging to prevent + * stale cached data with outdated extent flags. 
+ */ + ocfs2_extent_map_trunc(inode, cpos); context->clusters_moved += alloc_size; next: @@ -929,7 +930,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } /* - * rememer ip_xattr_sem also needs to be held if necessary + * remember ip_xattr_sem also needs to be held if necessary */ down_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -959,9 +960,9 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); @@ -1030,18 +1031,25 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) context->range = &range; + /* + * ok, the default threshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + + if (range.me_flags & ~(OCFS2_MOVE_EXT_FL_AUTO_DEFRAG | + OCFS2_MOVE_EXT_FL_PART_DEFRAG)) { + status = -EINVAL; + goto out_free; + } + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { context->auto_defrag = 1; - /* - * ok, the default theshold for the defragmentation - * is 1M, since our maximum clustersize was 1M also. - * any thought? 
- */ - if (!range.me_threshold) - range.me_threshold = 1024 * 1024; - - if (range.me_threshold > i_size_read(inode)) - range.me_threshold = i_size_read(inode); if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) context->partial = 1; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a8fd51afb794..c90b254da75e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -142,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, bail_add: ret = d_splice_alias(inode, dentry); + if (IS_ERR(ret)) + goto bail_unlock; if (inode) { /* @@ -154,15 +156,16 @@ bail_add: * NOTE: This dentry already has ->d_op set from * ocfs2_get_parent() and ocfs2_get_dentry() */ - if (!IS_ERR_OR_NULL(ret)) + if (ret) dentry = ret; status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); + if (ret) + dput(ret); ret = ERR_PTR(status); - goto bail_unlock; } } else ocfs2_dentry_attach_gen(dentry); @@ -197,11 +200,13 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. 
*/ if (S_ISDIR(mode)) set_nlink(inode, 2); - mode = mode_strip_sgid(&init_user_ns, dir, mode); - inode_init_owner(&init_user_ns, inode, dir, mode); + mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); status = dquot_initialize(inode); - if (status) + if (status) { + iput(inode); return ERR_PTR(status); + } return inode; } @@ -221,7 +226,7 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, iput(inode); } -static int ocfs2_mknod(struct user_namespace *mnt_userns, +static int ocfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -242,6 +247,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, int want_meta = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota_inode = 0; @@ -505,7 +511,6 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct inode *inode, dev_t dev, struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, handle_t *handle, struct ocfs2_alloc_context *inode_ac, u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) @@ -565,7 +570,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); - ktime_get_real_ts64(&ts); + ktime_get_coarse_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = @@ -638,27 +643,27 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, - parent_fe_bh, handle, inode_ac, - fe_blkno, suballoc_loc, suballoc_bit); + handle, inode_ac, fe_blkno, + suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry, - umode_t mode) +static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap, + struct inode *dir, + struct 
dentry *dentry, + umode_t mode) { int ret; trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); - return ret; + return ERR_PTR(ret); } -static int ocfs2_create(struct user_namespace *mnt_userns, +static int ocfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -668,7 +673,7 @@ static int ocfs2_create(struct user_namespace *mnt_userns, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -792,10 +797,11 @@ static int ocfs2_link(struct dentry *old_dentry, } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -992,9 +998,10 @@ static int ocfs2_unlink(struct inode *dir, drop_nlink(inode); drop_nlink(inode); ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -1194,7 +1201,7 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int 
ocfs2_rename(struct user_namespace *mnt_userns, +static int ocfs2_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, @@ -1335,7 +1342,7 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, goto bail; } - if (S_ISDIR(old_inode->i_mode)) { + if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) { u64 old_inode_parent; update_dot_dot = 1; @@ -1352,8 +1359,7 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, goto bail; } - if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= ocfs2_link_max(osb)) { + if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) { status = -EMLINK; goto bail; } @@ -1449,8 +1455,8 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; trace_ocfs2_rename_over_existing( - (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? - (unsigned long long)newfe_bh->b_blocknr : 0ULL); + (unsigned long long)newfe_blkno, newfe_bh, + (unsigned long long)newfe_bh->b_blocknr); if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, @@ -1534,9 +1540,13 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, status = ocfs2_add_entry(handle, new_dentry, old_inode, OCFS2_I(old_inode)->ip_blkno, new_dir_bh, &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } } - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), @@ -1545,8 +1555,8 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; - old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); - old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode)); + old_di->i_ctime_nsec = 
cpu_to_le32(inode_get_ctime_nsec(old_inode)); ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1585,13 +1595,20 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (S_ISDIR(old_inode->i_mode)) { drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); @@ -1609,7 +1626,8 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, if (old_dir != new_dir) { /* Keep the same times on both directories.*/ - new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + inode_set_mtime_to_ts(new_dir, + inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir))); /* * This will also pick up the i_nlink change from the @@ -1630,6 +1648,10 @@ static int ocfs2_rename(struct user_namespace *mnt_userns, INODE_CACHE(old_dir), old_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); ocfs2_journal_dirty(handle, old_dir_bh); @@ -1784,7 +1806,7 @@ bail: return status; } -static int ocfs2_symlink(struct user_namespace *mnt_userns, +static int ocfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -1805,6 +1827,7 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, int want_clusters = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota = 0, did_quota_inode = 0; @@ -2170,8 +2193,10 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, * @osb: ocfs2 file 
system * @ret_orphan_dir: Orphan dir inode - returned locked! * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @name: Buffer to store the name of the orphan. * @lookup: dir lookup result, to be passed back into functions like * ocfs2_orphan_add + * @dio: Flag indicating if direct IO is being used or not. * * Returns zero on success and the ret_orphan_dir, name and lookup * fields will be populated. @@ -2553,7 +2578,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, - 0, &new_di_bh, parent_di_bh, handle, + 0, &new_di_bh, handle, inode_ac, di_blkno, suballoc_loc, suballoc_bit); if (status < 0) { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a503c553bab2..6aaa94c554c1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -154,7 +154,7 @@ struct ocfs2_lock_stats { struct ocfs2_lock_res { void *l_priv; - struct ocfs2_lock_res_ops *l_ops; + const struct ocfs2_lock_res_ops *l_ops; struct list_head l_blocked_list; @@ -284,6 +284,45 @@ enum ocfs2_mount_options #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; + struct super_block *sb; +}; + +enum ocfs2_journal_trigger_type { + OCFS2_JTR_DI, + OCFS2_JTR_EB, + OCFS2_JTR_RB, + OCFS2_JTR_GD, + OCFS2_JTR_DB, + OCFS2_JTR_XB, + OCFS2_JTR_DQ, + OCFS2_JTR_DR, + OCFS2_JTR_DL, + OCFS2_JTR_NONE /* This must be the last entry */ +}; + +#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE + +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]); + +enum ocfs2_recovery_state { + OCFS2_REC_ENABLED = 0, + OCFS2_REC_QUOTA_WANT_DISABLE, + /* + * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for + * ocfs2_recovery_disable_quota() to work. 
+ */ + OCFS2_REC_QUOTA_DISABLED, + OCFS2_REC_WANT_DISABLE, + /* + * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work + */ + OCFS2_REC_DISABLED, +}; + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -346,11 +385,14 @@ struct ocfs2_super struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; - int disable_recovery; + enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; + /* Journal triggers for checksum */ + struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; + struct delayed_work la_enable_wq; /* diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7aebdbf5cc0a..f7763da5c4a2 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -132,7 +132,7 @@ * well as the name of the cluster being joined. * mount.ocfs2 must pass in a matching stack name. * - * If not set, the classic stack will be used. This is compatbile with + * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 @@ -143,7 +143,7 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 -/* Support for indexed directores */ +/* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ @@ -156,7 +156,7 @@ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* - * Incompat bit to indicate useable clusterinfo with stackflags for all + * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. 
*/ @@ -468,7 +468,8 @@ struct ocfs2_extent_list { __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ -/*10*/ struct ocfs2_extent_rec l_recs[]; /* Extent records */ + /* Extent records */ +/*10*/ struct ocfs2_extent_rec l_recs[] __counted_by_le(l_count); }; /* @@ -482,7 +483,8 @@ struct ocfs2_chain_list { __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; -/*10*/ struct ocfs2_chain_rec cl_recs[]; /* Chain records */ + /* Chain records */ +/*10*/ struct ocfs2_chain_rec cl_recs[] __counted_by_le(cl_count); }; /* @@ -494,7 +496,8 @@ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; -/*08*/ struct ocfs2_truncate_rec tl_recs[]; /* Truncate records */ + /* Truncate records */ +/*08*/ struct ocfs2_truncate_rec tl_recs[] __counted_by_le(tl_count); }; /* @@ -614,7 +617,7 @@ struct ocfs2_super_block { __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ -/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* @@ -796,9 +799,10 @@ struct ocfs2_dx_entry_list { * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ - struct ocfs2_dx_entry de_entries[]; /* Indexed dir entries - * in a packed array of - * length de_num_used */ + /* Indexed dir entries in a packed + * array of length de_num_used. + */ + struct ocfs2_dx_entry de_entries[] __counted_by_le(de_count); }; #define OCFS2_DX_FLAG_INLINE 0x01 @@ -883,7 +887,8 @@ struct ocfs2_group_desc __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. 
*/ /*10*/ __le32 bg_generation; - __le32 bg_reserved1; + __le16 bg_contig_free_bits; /* max contig free bits length */ + __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in @@ -933,7 +938,8 @@ struct ocfs2_refcount_list { __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ -/*10*/ struct ocfs2_refcount_rec rl_recs[]; /* Refcount records */ + /* Refcount records */ +/*10*/ struct ocfs2_refcount_rec rl_recs[] __counted_by_le(rl_count); }; @@ -1019,7 +1025,8 @@ struct ocfs2_xattr_header { buckets. A block uses xb_check and sets this field to zero.) */ - struct ocfs2_xattr_entry xh_entries[]; /* xattr entry list. */ + /* xattr entry list. */ + struct ocfs2_xattr_entry xh_entries[] __counted_by_le(xh_count); }; /* @@ -1082,7 +1089,7 @@ struct ocfs2_xattr_block { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this - block cotains xattr + block contains xattr tree. */ } xb_attrs; }; diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 9680797bc531..2de2f8733283 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -215,7 +215,7 @@ struct ocfs2_move_extents { movement less likely to fail, may make fs even more fragmented */ -#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation completely gets done. */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 8ac357ce6a30..9b234c03d693 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -93,7 +93,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DATA] = "Data", [OCFS2_LOCK_TYPE_SUPER] = "Super", [OCFS2_LOCK_TYPE_RENAME] = "Rename", - /* Need to differntiate from [R]ename.. 
serializing writes is the + /* Need to differentiate from [R]ename.. serializing writes is the * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index dc4bce1649c1..4b32fb5658ad 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string, __string(name,name) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) ); @@ -1157,8 +1157,6 @@ DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); -DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); - DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); TRACE_EVENT(ocfs2_try_to_write_inline_data, @@ -1291,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->para = para; ), TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, @@ -1315,10 +1313,10 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); - DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); @@ -1427,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->ia_valid = ia_valid; __entry->ia_mode = ia_mode; __entry->ia_uid = ia_uid; @@ -1470,6 +1468,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write, ); DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret); +DEFINE_OCFS2_INT_EVENT(filemap_splice_read_ret); /* End of trace events for fs/ocfs2/file.c. 
*/ @@ -1570,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); -DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); - TRACE_EVENT(ocfs2_inode_revalidate, TP_PROTO(void *inode, unsigned long long ino, unsigned int flags), @@ -1659,34 +1656,34 @@ TRACE_EVENT(ocfs2_remount, ); TRACE_EVENT(ocfs2_fill_super, - TP_PROTO(void *sb, void *data, int silent), - TP_ARGS(sb, data, silent), + TP_PROTO(void *sb, void *fc, int silent), + TP_ARGS(sb, fc, silent), TP_STRUCT__entry( __field(void *, sb) - __field(void *, data) + __field(void *, fc) __field(int, silent) ), TP_fast_assign( __entry->sb = sb; - __entry->data = data; + __entry->fc = fc; __entry->silent = silent; ), TP_printk("%p %p %d", __entry->sb, - __entry->data, __entry->silent) + __entry->fc, __entry->silent) ); TRACE_EVENT(ocfs2_parse_options, - TP_PROTO(int is_remount, char *options), - TP_ARGS(is_remount, options), + TP_PROTO(int is_remount, const char *option), + TP_ARGS(is_remount, option), TP_STRUCT__entry( __field(int, is_remount) - __string(options, options) + __string(option, option) ), TP_fast_assign( __entry->is_remount = is_remount; - __assign_str(options, options); + __assign_str(option); ), - TP_printk("%d %s", __entry->is_remount, __get_str(options)) + TP_printk("%d %s", __entry->is_remount, __get_str(option)) ); DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); @@ -1719,8 +1716,8 @@ TRACE_EVENT(ocfs2_initialize_super, __field(int, cluster_bits) ), TP_fast_assign( - __assign_str(label, label); - __assign_str(uuid_str, uuid_str); + __assign_str(label); + __assign_str(uuid_str); __entry->root_dir = root_dir; __entry->system_dir = system_dir; __entry->cluster_bits = cluster_bits; @@ -1747,7 +1744,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt, __field(int, credits) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->meta = meta; __entry->clusters = clusters; __entry->credits = credits; @@ -1771,7 +1768,7 @@ 
DECLARE_EVENT_CLASS(ocfs2__xattr_find, ), TP_fast_assign( __entry->ino = ino; - __assign_str(name, name); + __assign_str(name); __entry->name_index = name_index; __entry->hash = hash; __entry->location = location; @@ -2020,7 +2017,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper, __entry->dq_id = dq_id; __entry->dq_type = dq_type; __entry->type = type; - __assign_str(s_id, s_id); + __assign_str(s_id); ), TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, __entry->type, __get_str(s_id)) @@ -2061,7 +2058,7 @@ TRACE_EVENT(ocfs2_dx_dir_search, TP_fast_assign( __entry->ino = ino; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->blkno = blkno; @@ -2089,7 +2086,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk, ), TP_fast_assign( __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->blkno = blkno; __entry->dir = dir; ), @@ -2108,7 +2105,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry, TP_fast_assign( __entry->dir = dir; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s", __entry->dir, __entry->namelen, __get_str(name)) @@ -2136,7 +2133,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block, __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->num_used = num_used; ), TP_printk("%llu %x %x %.*s %u", __entry->dir, @@ -2172,7 +2169,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->extra = extra; ), @@ -2218,7 +2215,7 @@ TRACE_EVENT(ocfs2_mknod, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->dev = dev; __entry->mode = mode; @@ 
-2242,9 +2239,9 @@ TRACE_EVENT(ocfs2_link, TP_fast_assign( __entry->ino = ino; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s %.*s", __entry->ino, __entry->old_len, __get_str(old_name), @@ -2280,9 +2277,9 @@ TRACE_EVENT(ocfs2_rename, __entry->new_dir = new_dir; __entry->new_dentry = new_dentry; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%p %p %p %p %.*s %.*s", __entry->old_dir, __entry->old_dentry, @@ -2302,7 +2299,7 @@ TRACE_EVENT(ocfs2_rename_target_exists, ), TP_fast_assign( __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%.*s", __entry->new_len, __get_str(new_name)) ); @@ -2345,7 +2342,7 @@ TRACE_EVENT(ocfs2_symlink_begin, __entry->dentry = dentry; __entry->symname = symname; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, __entry->symname, __entry->len, __get_str(name)) @@ -2361,7 +2358,7 @@ TRACE_EVENT(ocfs2_blkno_stringify, ), TP_fast_assign( __entry->blkno = blkno; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->blkno, __get_str(name), @@ -2382,7 +2379,7 @@ TRACE_EVENT(ocfs2_orphan_del, ), TP_fast_assign( __entry->dir = dir; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->dir, __get_str(name), @@ -2404,7 +2401,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate, TP_fast_assign( __entry->dentry = dentry; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) ); @@ -2421,7 +2418,7 @@ 
TRACE_EVENT(ocfs2_dentry_revalidate_negative, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->pgen = pgen; __entry->gen = gen; ), @@ -2446,7 +2443,7 @@ TRACE_EVENT(ocfs2_find_local_alias, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%.*s", __entry->len, __get_str(name)) ); @@ -2463,7 +2460,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->fsdata = fsdata; ), @@ -2481,7 +2478,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found, __field(unsigned long long, ino) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->ino = ino; ), @@ -2528,7 +2525,7 @@ TRACE_EVENT(ocfs2_get_parent, TP_fast_assign( __entry->child = child; __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->ino = ino; ), TP_printk("%p %.*s %llu", __entry->child, __entry->len, @@ -2552,7 +2549,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin, TP_fast_assign( __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->fh = fh; __entry->len = len; __entry->connectable = connectable; @@ -2578,6 +2575,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); +DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits); + DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index ebb5c99f490e..788a8de922a4 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -97,7 +97,6 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); int ocfs2_global_read_info(struct super_block *sb, int type); int ocfs2_global_write_info(struct super_block *sb, int type); -int 
ocfs2_global_read_dquot(struct dquot *dquot); int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); static inline int ocfs2_sync_dquot(struct dquot *dquot) { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index dc9f76ab7e13..e85b1ccf81be 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -273,7 +273,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, if (new) memset(bh->b_data, 0, sb->s_blocksize); memcpy(bh->b_data + offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); unlock_buffer(bh); ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); @@ -371,12 +371,16 @@ int ocfs2_global_read_info(struct super_block *sb, int type) status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk, &pcount, NULL); - if (status < 0) + if (status < 0) { + mlog_errno(status); goto out_unlock; + } status = ocfs2_qinfo_lock(oinfo, 0); - if (status < 0) + if (status < 0) { + mlog_errno(status); goto out_unlock; + } status = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct ocfs2_global_disk_dqinfo), OCFS2_GLOBAL_INFO_OFF); @@ -404,12 +408,11 @@ int ocfs2_global_read_info(struct super_block *sb, int type) schedule_delayed_work(&oinfo->dqi_sync_work, msecs_to_jiffies(oinfo->dqi_syncms)); -out_err: - return status; + return 0; out_unlock: ocfs2_unlock_global_qf(oinfo, 0); - mlog_errno(status); - goto out_err; +out_err: + return status; } /* Write information to global quota file. 
Expects exclusive lock on quota @@ -447,14 +450,17 @@ int ocfs2_global_write_info(struct super_block *sb, int type) int err; struct quota_info *dqopt = sb_dqopt(sb); struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv; + unsigned int memalloc; down_write(&dqopt->dqio_sem); + memalloc = memalloc_nofs_save(); err = ocfs2_qinfo_lock(info, 1); if (err < 0) goto out_sem; err = __ocfs2_global_write_info(sb, type); ocfs2_qinfo_unlock(info, 1); out_sem: + memalloc_nofs_restore(memalloc); up_write(&dqopt->dqio_sem); return err; } @@ -601,6 +607,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; struct ocfs2_super *osb = OCFS2_SB(sb); int status = 0; + unsigned int memalloc; trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id), dquot->dq_id.type, @@ -618,6 +625,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) goto out_ilock; } down_write(&sb_dqopt(sb)->dqio_sem); + memalloc = memalloc_nofs_save(); status = ocfs2_sync_dquot(dquot); if (status < 0) mlog_errno(status); @@ -625,6 +633,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) status = ocfs2_local_write_dquot(dquot); if (status < 0) mlog_errno(status); + memalloc_nofs_restore(memalloc); up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: @@ -662,6 +671,7 @@ static int ocfs2_write_dquot(struct dquot *dquot) handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; + unsigned int memalloc; trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id), dquot->dq_id.type); @@ -673,7 +683,9 @@ static int ocfs2_write_dquot(struct dquot *dquot) goto out; } down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); + memalloc = memalloc_nofs_save(); status = ocfs2_local_write_dquot(dquot); + memalloc_nofs_restore(memalloc); up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out: @@ 
-749,6 +761,11 @@ static int ocfs2_release_dquot(struct dquot *dquot) handle = ocfs2_start_trans(osb, ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type)); if (IS_ERR(handle)) { + /* + * Mark dquot as inactive to avoid endless cycle in + * quota_release_workfn(). + */ + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); status = PTR_ERR(handle); mlog_errno(status); goto out_ilock; @@ -881,7 +898,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } @@ -920,6 +937,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(sb); + unsigned int memalloc; trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id), type); @@ -946,6 +964,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) goto out_ilock; } down_write(&sb_dqopt(sb)->dqio_sem); + memalloc = memalloc_nofs_save(); status = ocfs2_sync_dquot(dquot); if (status < 0) { mlog_errno(status); @@ -954,6 +973,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* Now write updated local dquot structure */ status = ocfs2_local_write_dquot(dquot); out_dlock: + memalloc_nofs_restore(memalloc); up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5022b3e9bfcd..de7f12858729 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -453,8 +453,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. - * The function expects local quota file to be already locked and - * s_umount locked in shared mode. */ + * The function expects local quota file to be already locked. 
*/ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -470,6 +469,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int bit, chunk; struct ocfs2_recovery_chunk *rchunk, *next; qsize_t spacechange, inodechange; + unsigned int memalloc; trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); @@ -521,6 +521,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, goto out_drop_lock; } down_write(&sb_dqopt(sb)->dqio_sem); + memalloc = memalloc_nofs_save(); spin_lock(&dquot->dq_dqb_lock); /* Add usage from quota entry into quota changes * of our node. Auxiliary variables are important @@ -553,6 +554,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); out_commit: + memalloc_nofs_restore(memalloc); up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(OCFS2_SB(sb), handle); out_drop_lock: @@ -585,7 +587,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, { unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; - struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; @@ -597,7 +598,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -674,8 +674,7 @@ out_put: break; } out: - up_read(&sb->s_umount); - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } @@ -689,7 +688,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) int status; struct buffer_head *bh = NULL; struct ocfs2_quota_recovery *rec; - int locked = 0; + int locked = 0, global_read = 0; info->dqi_max_spc_limit = 0x7fffffffffffffffLL; 
info->dqi_max_ino_limit = 0x7fffffffffffffffLL; @@ -697,6 +696,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" " info."); + status = -ENOMEM; goto out_err; } info->dqi_priv = oinfo; @@ -709,6 +709,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) status = ocfs2_global_read_info(sb, type); if (status < 0) goto out_err; + global_read = 1; status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); if (status < 0) { @@ -779,10 +780,12 @@ out_err: if (locked) ocfs2_inode_unlock(lqinode, 1); ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + if (global_read) + cancel_delayed_work_sync(&oinfo->dqi_sync_work); kfree(oinfo); } brelse(bh); - return -1; + return status; } /* Write local info to quota file */ @@ -811,7 +814,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) struct ocfs2_quota_chunk *chunk; struct ocfs2_local_disk_chunk *dchunk; int mark_clean = 1, len; - int status; + int status = 0; iput(oinfo->dqi_gqinode); ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); @@ -836,8 +839,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); /* - * s_umount held in exclusive mode protects us against racing with - * recovery thread... + * ocfs2_dismount_volume() has already aborted quota recovery... 
*/ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); @@ -853,17 +855,15 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) oinfo->dqi_libh, olq_update_info, info); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto out; - } - out: ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); - return 0; + info->dqi_priv = NULL; + return status; } static void olq_set_dquot(struct buffer_head *bh, void *private) @@ -1243,6 +1243,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot) &od->dq_local_phys_blk, &pcount, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } /* Initialize dquot structure on disk */ status = ocfs2_local_write_dquot(dquot); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 623db358b1ef..c92e0ea85bca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -25,6 +25,7 @@ #include "namei.h" #include "ocfs2_trace.h" #include "file.h" +#include "symlink.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -33,6 +34,7 @@ #include <linux/pagevec.h> #include <linux/swap.h> #include <linux/security.h> +#include <linux/string.h> #include <linux/fsnotify.h> #include <linux/quotaops.h> #include <linux/namei.h> @@ -620,7 +622,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, /* Initialize ocfs2_refcount_block. 
*/ rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(rb, 0, inode->i_sb->s_blocksize); - strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + strscpy(rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE); rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -630,7 +632,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); spin_lock(&osb->osb_lock); - rb->rf_generation = osb->s_next_generation++; + rb->rf_generation = cpu_to_le32(osb->s_next_generation++); spin_unlock(&osb->osb_lock); ocfs2_journal_dirty(handle, new_bh); @@ -1392,13 +1394,6 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b) return 0; } -static void swap_refcount_rec(void *a, void *b, int size) -{ - struct ocfs2_refcount_rec *l = a, *r = b; - - swap(*l, *r); -} - /* * The refcount cpos are ordered by their 64bit cpos, * But we will use the low 32 bit to be the e_cpos in the b-tree. 
@@ -1474,7 +1469,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, */ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), - cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + cmp_refcount_rec_by_low_cpos, NULL); ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); if (ret) { @@ -1499,11 +1494,11 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), - cmp_refcount_rec_by_cpos, swap_refcount_rec); + cmp_refcount_rec_by_cpos, NULL); sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), sizeof(struct ocfs2_refcount_rec), - cmp_refcount_rec_by_cpos, swap_refcount_rec); + cmp_refcount_rec_by_cpos, NULL); *split_cpos = cpos; return 0; @@ -1568,7 +1563,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, /* Initialize ocfs2_refcount_block. */ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); - strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + strscpy(new_rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -2426,7 +2421,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, * * If we will insert a new one, this is easy and only happens * during adding refcounted flag to the extent, so we don't - * have a chance of spliting. We just need one record. + * have a chance of splitting. We just need one record. * * If the refcount rec already exists, that would be a little * complicated. we may have to: @@ -2616,11 +2611,11 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, /* * Calculate out the start and number of virtual clusters we need to CoW. 
* - * cpos is vitual start cluster position we want to do CoW in a + * cpos is virtual start cluster position we want to do CoW in a * file and write_len is the cluster length. * max_cpos is the place where we want to stop CoW intentionally. * - * Normal we will start CoW from the beginning of extent record cotaining cpos. + * Normal we will start CoW from the beginning of extent record containing cpos. * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we * get good I/O from the resulting extent tree. */ @@ -2908,7 +2903,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, int ret = 0, partial; struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); - struct page *page; pgoff_t page_index; unsigned int from, to; loff_t offset, end, map_end; @@ -2927,6 +2921,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { + struct folio *folio; page_index = offset >> PAGE_SHIFT; map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) @@ -2939,9 +2934,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, to = map_end & (PAGE_SIZE - 1); retry: - page = find_or_create_page(mapping, page_index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, page_index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); break; } @@ -2951,18 +2947,17 @@ retry: * page, so write it back. 
*/ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { - if (PageDirty(page)) { - /* - * write_on_page will unlock the page on return - */ - ret = write_one_page(page); + if (folio_test_dirty(folio)) { + folio_unlock(folio); + folio_put(folio); + + ret = filemap_write_and_wait_range(mapping, + offset, map_end - 1); goto retry; } } - if (!PageUptodate(page)) { - struct folio *folio = page_folio(page); - + if (!folio_test_uptodate(folio)) { ret = block_read_full_folio(folio, ocfs2_get_block); if (ret) { mlog_errno(ret); @@ -2971,8 +2966,8 @@ retry: folio_lock(folio); } - if (page_has_buffers(page)) { - ret = walk_page_buffers(handle, page_buffers(page), + if (folio_buffers(folio)) { + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_clear_cow_buffer); if (ret) { @@ -2981,14 +2976,12 @@ retry: } } - ocfs2_map_and_dirty_page(inode, - handle, from, to, - page, 0, &new_block); - mark_page_accessed(page); + ocfs2_map_and_dirty_folio(inode, handle, from, to, + folio, 0, &new_block); + folio_mark_accessed(folio); unlock: - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); offset = map_end; if (ret) break; @@ -3749,9 +3742,9 @@ static int ocfs2_change_ctime(struct inode *inode, goto out_commit; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(handle, di_bh); @@ -4072,12 +4065,12 @@ static int ocfs2_complete_reflink(struct inode *s_inode, * we want mtime to appear identical to the source and * update ctime. 
*/ - t_inode->i_ctime = current_time(t_inode); + inode_set_ctime_current(t_inode); - di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode)); - t_inode->i_mtime = s_inode->i_mtime; + inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode)); di->i_mtime = s_di->i_mtime; di->i_mtime_nsec = s_di->i_mtime_nsec; } @@ -4154,8 +4147,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry, int ret; struct inode *inode = d_inode(old_dentry); struct buffer_head *new_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { ret = -EINVAL; mlog_errno(ret); goto out; @@ -4181,6 +4175,26 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out_unlock; } + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && + (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + /* + * Adjust extent record count to reserve space for extended attribute. + * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). 
+ */ + struct ocfs2_inode_info *new_oi = OCFS2_I(new_inode); + + if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && + !(ocfs2_inode_is_fast_symlink(new_inode))) { + struct ocfs2_dinode *new_di = (struct ocfs2_dinode *)new_bh->b_data; + struct ocfs2_dinode *old_di = (struct ocfs2_dinode *)old_bh->b_data; + struct ocfs2_extent_list *el = &new_di->id2.i_list; + int inline_size = le16_to_cpu(old_di->i_xattr_inline_size); + + le16_add_cpu(&el->l_count, -(inline_size / + sizeof(struct ocfs2_extent_rec))); + } + } + ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh, preserve); if (ret) { @@ -4188,7 +4202,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto inode_unlock; } - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh, preserve); @@ -4316,7 +4330,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4370,7 +4384,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. 
*/ if (!preserve) { - error = inode_permission(&init_user_ns, inode, MAY_READ); + error = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (error) return error; } @@ -4405,7 +4419,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4422,7 +4436,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); @@ -4455,7 +4469,7 @@ int ocfs2_reflink_update_dest(struct inode *dest, if (newlen > i_size_read(dest)) i_size_write(dest, newlen); spin_unlock(&OCFS2_I(dest)->ip_lock); - dest->i_ctime = dest->i_mtime = current_time(dest); + inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest)); ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); if (ret) { diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index a9d1296d736d..1fe61974d9f0 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -414,7 +414,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, start = search_start; while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, - start)) != -1) { + start)) < resmap->m_bitmap_len) { /* Search reached end of the region */ if (offset >= (search_start + search_len)) break; diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index ec8101ef5717..4fce17180342 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -31,7 +31,7 @@ struct ocfs2_alloc_reservation { #define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ #define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be - * destroyed immedately after use */ + * destroyed immediately after use */ #define 
OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed * directory btree */ @@ -125,7 +125,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. * @resmap: reservations bitmap - * @resv: optional reservation to recalulate based on new bitmap + * @resv: optional reservation to recalculate based on new bitmap * @cstart: start of allocation in clusters * @clen: end of allocation in clusters. * diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d65d43c61857..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -91,6 +91,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); u16 old_bg_clusters; + u16 contig_bits; + __le16 old_bg_contig_free_bits; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -122,6 +124,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } + contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap, + le16_to_cpu(group->bg_bits), 0); + old_bg_contig_free_bits = group->bg_contig_free_bits; + group->bg_contig_free_bits = cpu_to_le16(contig_bits); + ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. 
*/ @@ -160,6 +167,7 @@ out_rollback: le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + group->bg_contig_free_bits = old_bg_contig_free_bits; } out: if (ret) @@ -566,6 +574,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index da7718cef735..e544c704b583 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -37,7 +37,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot si_slots[]; + struct ocfs2_slot si_slots[] __counted_by(si_num_slots); }; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index c973c03f6fd8..f58e891aa2da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -227,7 +227,7 @@ static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) } /* - * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB * contents, it will zero out the LVB. Thus the caller can always trust * the contents. 
*/ @@ -404,7 +404,7 @@ static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, return 0; } -static struct ocfs2_stack_operations o2cb_stack_ops = { +static const struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, .this_node = o2cb_cluster_this_node, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 64e6ddcfe329..be0a5758bd40 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> @@ -359,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file, struct ocfs2_control_message_setn *msg) { long nodenum; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; if (ocfs2_control_get_handshake_state(file) != @@ -374,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, return -EINVAL; msg->space = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -390,7 +389,6 @@ static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) { long major, minor; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = &ocfs2_user_plugin.sp_max_proto; @@ -408,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - major = simple_strtol(msg->major, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->major, 16, &major)) return -EINVAL; - minor = simple_strtol(msg->minor, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->minor, 16, &minor)) return -EINVAL; /* @@ -440,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file, struct 
ocfs2_control_message_down *msg) { long nodenum; - char *p = NULL; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_VALID) @@ -455,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &p, 16); - if (!p || *p) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -737,20 +731,13 @@ static int user_plock(struct ocfs2_cluster_connection *conn, * * Internally, fs/dlm will pass these to a misc device, which * a userspace daemon will read and write to. - * - * For now, cancel requests (which happen internally only), - * are turned into unlocks. Most of this function taken from - * gfs2_lock. */ - if (cmd == F_CANCELLK) { - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); else return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); @@ -965,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { version_unlock(conn); - dlm_release_lockspace(conn->cc_lockspace, 2); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); conn->cc_lockspace = NULL; ocfs2_live_connection_drop(conn->cc_private); conn->cc_private = NULL; @@ -1024,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) printk(KERN_ERR "ocfs2: Could not determine" " locking version\n"); user_cluster_disconnect(conn); + lc = NULL; goto out; } wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); @@ -1071,7 +1059,7 @@ static int user_cluster_this_node(struct 
ocfs2_cluster_connection *conn, return 0; } -static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { +static const struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .connect = user_cluster_connect, .disconnect = user_cluster_disconnect, .this_node = user_cluster_this_node, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index a8d5ca98fa57..a28c127b9934 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -650,7 +650,7 @@ error: * and easier to preserve the name. */ -static struct ctl_table ocfs2_nm_table[] = { +static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, @@ -658,7 +658,6 @@ static struct ctl_table ocfs2_nm_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - { } }; static struct ctl_table_header *ocfs2_table_header; @@ -692,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void) memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 3636847fae19..5486a6dce70a 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -210,7 +210,7 @@ struct ocfs2_stack_operations { struct file_lock *fl); /* - * This is an optoinal debugging hook. If provided, the + * This is an optional debugging hook. If provided, the * stack can dump debugging information about this lock. */ void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); @@ -223,7 +223,7 @@ struct ocfs2_stack_operations { */ struct ocfs2_stack_plugin { char *sp_name; - struct ocfs2_stack_operations *sp_ops; + const struct ocfs2_stack_operations *sp_ops; struct module *sp_owner; /* These are managed by the stackglue code. 
*/ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 166c8918c825..6ac4dcd54588 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -50,6 +50,10 @@ struct ocfs2_suballoc_result { u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ + unsigned int sr_max_contig_bits; /* The length for contiguous + * free bits, only available + * for cluster group + */ }; static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) @@ -694,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (PTR_ERR(bg_bh) == -ENOSPC) + if (PTR_ERR(bg_bh) == -ENOSPC) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); + } if (IS_ERR(bg_bh)) { status = PTR_ERR(bg_bh); bg_bh = NULL; @@ -1272,6 +1278,26 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, return ret; } +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start) +{ + u16 offset, free_bits; + u16 contig_bits = 0; + + while (start < total_bits) { + offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); + if (offset == total_bits) + break; + + start = ocfs2_find_next_bit(bitmap, total_bits, offset); + free_bits = start - offset; + if (contig_bits < free_bits) + contig_bits = free_bits; + } + + return contig_bits; +} + static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, @@ -1280,6 +1306,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, { void *bitmap; u16 best_offset, best_size; + u16 prev_best_size = 0; int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; @@ -1290,10 +1317,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, found = 
start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; - while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { - if (offset == total_bits) - break; - + while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) < + total_bits) { if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { /* We found a zero, but we can't use it as it * hasn't been put to disk yet! */ @@ -1308,6 +1333,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, /* got a zero after some ones */ found = 1; start = offset + 1; + prev_best_size = best_size; } if (found > best_size) { best_size = found; @@ -1320,6 +1346,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } + /* best_size will be allocated, we save prev_best_size */ + res->sr_max_contig_bits = prev_best_size; if (best_size) { res->sr_bit_offset = best_offset; res->sr_bits = best_size; @@ -1337,11 +1365,16 @@ int ocfs2_block_group_set_bits(handle_t *handle, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits) + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath) { int status; void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + unsigned int start = bit_off + num_bits; + u16 contig_bits; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -1373,6 +1406,29 @@ int ocfs2_block_group_set_bits(handle_t *handle, while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); + /* + * this is optimize path, caller set old contig value + * in max_contig_bits to bypass finding action. + */ + if (fastpath) { + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else if (ocfs2_is_cluster_bitmap(alloc_inode)) { + /* + * Usually, the block group bitmap allocates only 1 bit + * at a time, while the cluster group allocates n bits + * each time. 
Therefore, we only save the contig bits for + * the cluster group. + */ + contig_bits = ocfs2_find_max_contig_free_bits(bitmap, + le16_to_cpu(bg->bg_bits), start); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + ocfs2_journal_dirty(handle, group_bh); bail: @@ -1486,7 +1542,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, BUG_ON(!ocfs2_is_cluster_bitmap(inode)); - if (gd->bg_free_bits_count) { + if (le16_to_cpu(gd->bg_contig_free_bits) && + le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted) + return -ENOSPC; + + /* ->bg_contig_free_bits may un-initialized, so compare again */ + if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) { max_bits = le16_to_cpu(gd->bg_bits); /* Tail groups in cluster bitmaps which aren't cpg @@ -1530,13 +1591,6 @@ static int ocfs2_cluster_group_search(struct inode *inode, * of bits. */ if (min_bits <= res->sr_bits) search = 0; /* success */ - else if (res->sr_bits) { - /* - * Don't show bits which we'll be returning - * for allocation to the local alloc bitmap. 
- */ - ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); - } } return search; @@ -1555,7 +1609,7 @@ static int ocfs2_block_group_search(struct inode *inode, BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) { + if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), @@ -1715,7 +1769,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - res->sr_bit_offset, res->sr_bits); + res->sr_bit_offset, res->sr_bits, + res->sr_max_contig_bits, 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, @@ -1741,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; + u32 contig_bits; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1766,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. 
*/ - while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, - ac->ac_max_block, - res)) == -ENOSPC) { + while (1) { + if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) { + contig_bits = le16_to_cpu(bg->bg_contig_free_bits); + if (!contig_bits) + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (bits_wanted > contig_bits && contig_bits >= min_bits) + bits_wanted = contig_bits; + } + + status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, res); + if (status != -ENOSPC) + break; if (!bg->bg_next_group) break; @@ -1849,7 +1916,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bg, group_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (status < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, chain); @@ -1927,6 +1996,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; +search: status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { @@ -1951,7 +2021,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; - if (!cl->cl_recs[i].c_free) + if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted) continue; ac->ac_chain = i; @@ -1967,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, } } + /* Chains can't supply the bits_wanted contiguous space. + * We should switch to using every single bit when allocating + * from the global bitmap. 
*/ + if (i == le16_to_cpu(cl->cl_next_free_rec) && + status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; + ac->ac_chain = victim; + goto search; + } + set_hint: if (status != -ENOSPC) { /* If the next search of this group is not likely to @@ -2163,7 +2243,9 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, bg, bg_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, ac->ac_bh, res->sr_bits, chain); @@ -2308,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle, BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL - && ac->ac_which != OCFS2_AC_USE_MAIN); + && ac->ac_which != OCFS2_AC_USE_MAIN + && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { WARN_ON(min_clusters > 1); @@ -2382,11 +2465,13 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits, + unsigned int max_contig_bits, void (*undo_fn)(unsigned int bit, unsigned long *bmap)) { int status; unsigned int tmp; + u16 contig_bits; struct ocfs2_group_desc *undo_bg = NULL; struct journal_head *jh; @@ -2433,6 +2518,20 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, num_bits); } + /* + * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), + * we still need to rescan whole bitmap. 
+ */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + if (undo_fn) spin_unlock(&jh->b_state_lock); @@ -2459,6 +2558,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; + __le16 old_bg_contig_free_bits = 0; /* The alloc_bh comes from ocfs2_free_dinode() or * ocfs2_free_clusters(). The callers have all locked the @@ -2483,9 +2583,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + if (ocfs2_is_cluster_bitmap(alloc_inode)) + old_bg_contig_free_bits = group->bg_contig_free_bits; status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, - start_bit, count, undo_fn); + start_bit, count, 0, undo_fn); if (status < 0) { mlog_errno(status); goto bail; @@ -2496,7 +2598,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, if (status < 0) { mlog_errno(status); ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, - start_bit, count); + start_bit, count, + le16_to_cpu(old_bg_contig_free_bits), 1); goto bail; } diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 9c74eace3adc..bcf2ed4a8631 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -29,6 +29,7 @@ struct ocfs2_alloc_context { #define OCFS2_AC_USE_MAIN 2 #define OCFS2_AC_USE_INODE 3 #define OCFS2_AC_USE_META 4 +#define OCFS2_AC_USE_MAIN_DISCONTIG 5 u32 ac_which; /* these are used by the chain search */ @@ -79,12 +80,16 @@ void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start); 
int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits); + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath); int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0b0e6a132101..2c7ba1480f7a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -19,10 +19,10 @@ #include <linux/blkdev.h> #include <linux/socket.h> #include <linux/inet.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/crc32.h> #include <linux/debugfs.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> #include <linux/signal.h> @@ -80,17 +80,15 @@ struct mount_options unsigned int resv_level; int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + bool user_stack; }; -static int ocfs2_parse_options(struct super_block *sb, char *options, - struct mount_options *mopt, - int is_remount); +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); -static int ocfs2_remount(struct super_block *sb, int *flags, char *data); static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); @@ -122,7 +120,7 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); static int ocfs2_enable_quotas(struct ocfs2_super *osb); static void ocfs2_disable_quotas(struct ocfs2_super *osb); -static struct dquot **ocfs2_get_dquots(struct inode *inode) +static struct dquot __rcu **ocfs2_get_dquots(struct inode 
*inode) { return OCFS2_I(inode)->i_dquot; } @@ -131,11 +129,10 @@ static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, .free_inode = ocfs2_free_inode, - .drop_inode = ocfs2_drop_inode, + .drop_inode = inode_just_drop, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, - .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, .quota_read = ocfs2_quota_read, .quota_write = ocfs2_quota_write, @@ -144,15 +141,10 @@ static const struct super_operations ocfs2_sops = { enum { Opt_barrier, - Opt_err_panic, - Opt_err_ro, + Opt_errors, Opt_intr, - Opt_nointr, - Opt_hb_none, - Opt_hb_local, - Opt_hb_global, - Opt_data_ordered, - Opt_data_writeback, + Opt_heartbeat, + Opt_data, Opt_atime_quantum, Opt_slot, Opt_commit, @@ -160,52 +152,64 @@ enum { Opt_localflocks, Opt_stack, Opt_user_xattr, - Opt_nouser_xattr, Opt_inode64, Opt_acl, - Opt_noacl, Opt_usrquota, Opt_grpquota, - Opt_coherency_buffered, - Opt_coherency_full, + Opt_coherency, Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, - Opt_err_cont, - Opt_err, }; -static const match_table_t tokens = { - {Opt_barrier, "barrier=%u"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_hb_none, OCFS2_HB_NONE}, - {Opt_hb_local, OCFS2_HB_LOCAL}, - {Opt_hb_global, OCFS2_HB_GLOBAL}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_atime_quantum, "atime_quantum=%u"}, - {Opt_slot, "preferred_slot=%u"}, - {Opt_commit, "commit=%u"}, - {Opt_localalloc, "localalloc=%d"}, - {Opt_localflocks, "localflocks"}, - {Opt_stack, "cluster_stack=%s"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_inode64, "inode64"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_coherency_buffered, "coherency=buffered"}, - {Opt_coherency_full, 
"coherency=full"}, - {Opt_resv_level, "resv_level=%u"}, - {Opt_dir_resv_level, "dir_resv_level=%u"}, - {Opt_journal_async_commit, "journal_async_commit"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err, NULL} +static const struct constant_table ocfs2_param_errors[] = { + {"panic", OCFS2_MOUNT_ERRORS_PANIC}, + {"remount-ro", OCFS2_MOUNT_ERRORS_ROFS}, + {"continue", OCFS2_MOUNT_ERRORS_CONT}, + {} +}; + +static const struct constant_table ocfs2_param_heartbeat[] = { + {"local", OCFS2_MOUNT_HB_LOCAL}, + {"none", OCFS2_MOUNT_HB_NONE}, + {"global", OCFS2_MOUNT_HB_GLOBAL}, + {} +}; + +static const struct constant_table ocfs2_param_data[] = { + {"writeback", OCFS2_MOUNT_DATA_WRITEBACK}, + {"ordered", 0}, + {} +}; + +static const struct constant_table ocfs2_param_coherency[] = { + {"buffered", OCFS2_MOUNT_COHERENCY_BUFFERED}, + {"full", 0}, + {} +}; + +static const struct fs_parameter_spec ocfs2_param_spec[] = { + fsparam_u32 ("barrier", Opt_barrier), + fsparam_enum ("errors", Opt_errors, ocfs2_param_errors), + fsparam_flag_no ("intr", Opt_intr), + fsparam_enum ("heartbeat", Opt_heartbeat, ocfs2_param_heartbeat), + fsparam_enum ("data", Opt_data, ocfs2_param_data), + fsparam_u32 ("atime_quantum", Opt_atime_quantum), + fsparam_u32 ("preferred_slot", Opt_slot), + fsparam_u32 ("commit", Opt_commit), + fsparam_s32 ("localalloc", Opt_localalloc), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_string ("cluster_stack", Opt_stack), + fsparam_flag_no ("user_xattr", Opt_user_xattr), + fsparam_flag ("inode64", Opt_inode64), + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_enum ("coherency", Opt_coherency, ocfs2_param_coherency), + fsparam_u32 ("resv_level", Opt_resv_level), + fsparam_u32 ("dir_resv_level", Opt_dir_resv_level), + fsparam_flag ("journal_async_commit", Opt_journal_async_commit), + {} }; #ifdef CONFIG_DEBUG_FS @@ -600,32 +604,32 @@ static unsigned long long 
ocfs2_max_file_offset(unsigned int bbits, return (((unsigned long long)bytes) << bitshift) - trim; } -static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +static int ocfs2_reconfigure(struct fs_context *fc) { int incompat_features; int ret = 0; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; sync_filesystem(sb); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || - !ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { ret = -EINVAL; goto out; } tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + if ((osb->s_mount_opt & tmp) != (parsed_options->mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; } if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != - (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + (parsed_options->mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change data mode on remount\n"); goto out; @@ -634,16 +638,16 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) /* Probably don't want this on remount; it might * mess with other nodes */ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && - (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + (parsed_options->mount_opt & OCFS2_MOUNT_INODE64)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); goto out; } /* We're going to/from readonly mode. 
*/ - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { + if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -657,7 +661,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { @@ -678,11 +682,11 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } - trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, fc->sb_flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & SB_RDONLY)) { + if (!ret && !(fc->sb_flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -701,11 +705,11 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. 
*/ - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - if (parsed_options.commit_interval) - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + if (parsed_options->commit_interval) + osb->osb_commit_interval = parsed_options->commit_interval; if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); @@ -952,8 +956,10 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; - oinfo = sb_dqinfo(sb, type)->dqi_priv; - cancel_delayed_work_sync(&oinfo->dqi_sync_work); + if (!sb_has_quota_suspended(sb, type)) { + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + } inode = igrab(sb->s_dquot.files[type]); /* Turn off quotas. 
This will remove all dquot structures from * memory and so they will be automatically synced to global @@ -964,23 +970,18 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } } -static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +static int ocfs2_fill_super(struct super_block *sb, struct fs_context *fc) { struct dentry *root; int status, sector_size; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; char nodestr[12]; struct ocfs2_blockcheck_stats stats; - trace_ocfs2_fill_super(sb, data, silent); - - if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { - status = -EINVAL; - goto out; - } + trace_ocfs2_fill_super(sb, fc, fc->sb_flags & SB_SILENT); /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); @@ -997,24 +998,24 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb = OCFS2_SB(sb); - if (!ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { status = -EINVAL; goto out_super; } - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + osb->osb_commit_interval = parsed_options->commit_interval; - ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); - osb->osb_resv_level = parsed_options.resv_level; - osb->osb_dir_resv_level = parsed_options.resv_level; - if (parsed_options.dir_resv_level == -1) - osb->osb_dir_resv_level = parsed_options.resv_level; + ocfs2_la_set_sizes(osb, parsed_options->localalloc_opt); + osb->osb_resv_level = parsed_options->resv_level; + osb->osb_dir_resv_level = 
parsed_options->resv_level; + if (parsed_options->dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options->resv_level; else - osb->osb_dir_resv_level = parsed_options.dir_resv_level; + osb->osb_dir_resv_level = parsed_options->dir_resv_level; - status = ocfs2_verify_userspace_stack(osb, &parsed_options); + status = ocfs2_verify_userspace_stack(osb, parsed_options); if (status) goto out_super; @@ -1073,9 +1074,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (ocfs2_meta_ecc(osb)) + if (ocfs2_meta_ecc(osb)) { + ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers); ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, osb->osb_debug_root); + } status = ocfs2_mount_volume(sb); if (status < 0) @@ -1176,27 +1179,72 @@ out: return status; } -static struct dentry *ocfs2_mount(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int ocfs2_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); + return get_tree_bdev(fc, ocfs2_fill_super); +} + +static void ocfs2_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations ocfs2_context_ops = { + .parse_param = ocfs2_parse_param, + .get_tree = ocfs2_get_tree, + .reconfigure = ocfs2_reconfigure, + .free = ocfs2_free_fc, +}; + +static int ocfs2_init_fs_context(struct fs_context *fc) +{ + struct mount_options *mopt; + + mopt = kzalloc(sizeof(struct mount_options), GFP_KERNEL); + if (!mopt) + return -EINVAL; + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + fc->fs_private = mopt; + fc->ops 
= &ocfs2_context_ops; + + return 0; } static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", - .mount = ocfs2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL + .next = NULL, + .init_fs_context = ocfs2_init_fs_context, + .parameters = ocfs2_param_spec, }; MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) { + if (options->user_stack == 0) { + u32 tmp; + + /* Ensure only one heartbeat mode */ + tmp = options->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + return 0; + } + } if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { @@ -1228,241 +1276,142 @@ static int ocfs2_check_set_options(struct super_block *sb, return 1; } -static int ocfs2_parse_options(struct super_block *sb, - char *options, - struct mount_options *mopt, - int is_remount) +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int status, user_stack = 0; - char *p; - u32 tmp; - int token, option; - substring_t args[MAX_OPT_ARGS]; - - trace_ocfs2_parse_options(is_remount, options ? 
options : "(none)"); - - mopt->commit_interval = 0; - mopt->mount_opt = OCFS2_MOUNT_NOINTR; - mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; - mopt->slot = OCFS2_INVALID_SLOT; - mopt->localalloc_opt = -1; - mopt->cluster_stack[0] = '\0'; - mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; - mopt->dir_resv_level = -1; - - if (!options) { - status = 1; - goto bail; - } - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_hb_local: - mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; - break; - case Opt_hb_none: - mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; - break; - case Opt_hb_global: - mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; - break; - case Opt_barrier: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->mount_opt |= OCFS2_MOUNT_BARRIER; - else - mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; - break; - case Opt_intr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; - break; - case Opt_nointr: + struct fs_parse_result result; + int opt; + struct mount_options *mopt = fc->fs_private; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); + + trace_ocfs2_parse_options(is_remount, param->key); + + opt = fs_parse(fc, ocfs2_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_heartbeat: + mopt->mount_opt |= result.uint_32; + break; + case Opt_barrier: + if (result.uint_32) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOINTR; - break; - case Opt_err_panic: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; - break; - case Opt_err_ro: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; - break; - case Opt_err_cont: - 
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; - break; - case Opt_data_ordered: - mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_data_writeback: - mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_user_xattr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_nouser_xattr: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_errors: + mopt->mount_opt &= ~(OCFS2_MOUNT_ERRORS_CONT | + OCFS2_MOUNT_ERRORS_ROFS | + OCFS2_MOUNT_ERRORS_PANIC); + mopt->mount_opt |= result.uint_32; + break; + case Opt_data: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + mopt->mount_opt |= result.uint_32; + break; + case Opt_user_xattr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_atime_quantum: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->atime_quantum = option; - break; - case Opt_slot: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->slot = (u16)option; - break; - case Opt_commit: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - mopt->commit_interval = HZ * option; - break; - case Opt_localalloc: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->localalloc_opt = option; - break; - case Opt_localflocks: - /* - * Changing this during remount could race - * flock() requests, or "unbalance" existing - * ones (e.g., a lock is taken in one mode but - * dropped in the other). If users care enough - * to flip locking modes during remount, we - * could add a "local" flag to individual - * flock structures for proper tracking of - * state. 
- */ - if (!is_remount) - mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; - break; - case Opt_stack: - /* Check both that the option we were passed - * is of the right length and that it is a proper - * string of the right length. - */ - if (((args[0].to - args[0].from) != - OCFS2_STACK_LABEL_LEN) || - (strnlen(args[0].from, - OCFS2_STACK_LABEL_LEN) != - OCFS2_STACK_LABEL_LEN)) { - mlog(ML_ERROR, - "Invalid cluster_stack option\n"); - status = 0; - goto bail; - } - memcpy(mopt->cluster_stack, args[0].from, - OCFS2_STACK_LABEL_LEN); - mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; - /* - * Open code the memcmp here as we don't have - * an osb to pass to - * ocfs2_userspace_stack(). - */ - if (memcmp(mopt->cluster_stack, - OCFS2_CLASSIC_CLUSTER_STACK, - OCFS2_STACK_LABEL_LEN)) - user_stack = 1; - break; - case Opt_inode64: - mopt->mount_opt |= OCFS2_MOUNT_INODE64; - break; - case Opt_usrquota: - mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; - break; - case Opt_grpquota: - mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; - break; - case Opt_coherency_buffered: - mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_coherency_full: - mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_acl: - mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; - mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; - break; - case Opt_noacl: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + mopt->atime_quantum = result.uint_32; + break; + case Opt_slot: + if (result.uint_32) + mopt->slot = (u16)result.uint_32; + break; + case Opt_commit: + if (result.uint_32 == 0) + mopt->commit_interval = HZ * JBD2_DEFAULT_MAX_COMMIT_AGE; + else + mopt->commit_interval = HZ * result.uint_32; + break; + case Opt_localalloc: + if (result.int_32 >= 0) + mopt->localalloc_opt = result.int_32; + break; + case Opt_localflocks: + /* + * Changing this during remount could race flock() requests, or + * "unbalance" existing ones (e.g., a lock is taken in one mode + * 
but dropped in the other). If users care enough to flip + * locking modes during remount, we could add a "local" flag to + * individual flock structures for proper tracking of state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed is of the right + * length and that it is a proper string of the right length. + */ + if (strlen(param->string) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, "Invalid cluster_stack option\n"); + return -EINVAL; + } + memcpy(mopt->cluster_stack, param->string, OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have an osb to pass + * to ocfs2_userspace_stack(). + */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + mopt->user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + mopt->mount_opt |= result.uint_32; + break; + case Opt_acl: + if (result.negated) { mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + } else { + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; + } + break; + case Opt_resv_level: + if (is_remount) break; - case Opt_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->resv_level = option; - break; - case Opt_dir_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->dir_resv_level = option; - break; - case 
Opt_journal_async_commit: - mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = result.uint_32; + break; + case Opt_dir_resv_level: + if (is_remount) break; - default: - mlog(ML_ERROR, - "Unrecognized mount option \"%s\" " - "or missing value\n", p); - status = 0; - goto bail; - } - } - - if (user_stack == 0) { - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | - OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; - } + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = result.uint_32; + break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; + default: + return -EINVAL; } - status = 1; - -bail: - return status; + return 0; } static int ocfs2_show_options(struct seq_file *s, struct dentry *root) @@ -1515,8 +1464,7 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, - OCFS2_STACK_LABEL_LEN); + seq_show_option(s, "cluster_stack", osb->osb_cluster_stack); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) @@ -1568,15 +1516,13 @@ static int __init ocfs2_init(void) ocfs2_set_locking_protocol(); - status = register_quota_format(&ocfs2_quota_format); - if (status < 0) - goto out3; + register_quota_format(&ocfs2_quota_format); + status = register_filesystem(&ocfs2_fs_type); if (!status) return 0; unregister_quota_format(&ocfs2_quota_format); -out3: debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); out2: @@ -1705,18 +1651,17 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, 
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, NULL); ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", sizeof(struct ocfs2_quota_chunk), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + SLAB_RECLAIM_ACCOUNT, NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { @@ -1858,7 +1803,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); - /* Remove file check sysfs related directores/files, + /* Remove file check sysfs related directories/files, * and wait for the pending file check operations */ ocfs2_filecheck_remove_sysfs(osb); @@ -1867,6 +1812,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); + /* Stop quota recovery so that we can disable quotas */ + ocfs2_recovery_disable_quota(osb); + ocfs2_disable_quotas(osb); /* All dquots should be freed by now */ @@ -2014,7 +1962,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; - sb->s_d_op = &ocfs2_dentry_ops; + set_default_d_op(sb, &ocfs2_dentry_ops); sb->s_export_op = &ocfs2_export_ops; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->dq_op = &ocfs2_quota_operations; @@ -2026,8 +1974,8 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); - memcpy(&sb->s_uuid, di->id2.i_super.s_uuid, - sizeof(di->id2.i_super.s_uuid)); + super_set_uuid(sb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - 
bbits)) - 1; @@ -2319,6 +2267,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; + u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { @@ -2333,11 +2282,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, goto out; } status = -EINVAL; - if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */ + blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + if (blksz_bits < 9 || blksz_bits > 12) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size bits: found %u, should be 9, 10, 11, or 12\n", + blksz_bits); + } else if ((1 << blksz_bits) != blksz) { mlog(ML_ERROR, "found superblock with incorrect block " - "size: found %u, should be %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), - blksz); + "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != @@ -2355,8 +2308,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, (unsigned long long)bh->b_blocknr); } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { - mlog(ML_ERROR, "bad cluster size found: %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + mlog(ML_ERROR, "bad cluster size bit found: %u\n", + le32_to_cpu(di->id2.i_super.s_clustersize_bits)); } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { mlog(ML_ERROR, "bad root_blkno: 0\n"); } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index d4c5fdcfa1e4..ad8be3300b49 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -54,31 +54,27 @@ static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) { - 
struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *bh = NULL; int status = ocfs2_read_inode_block(inode, &bh); struct ocfs2_dinode *fe; const char *link; - void *kaddr; size_t len; if (status < 0) { mlog_errno(status); - return status; + goto out; } fe = (struct ocfs2_dinode *) bh->b_data; link = (char *) fe->id2.i_symlink; /* will be less than a page size */ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb)); - kaddr = kmap_atomic(page); - memcpy(kaddr, link, len + 1); - kunmap_atomic(kaddr); - SetPageUptodate(page); - unlock_page(page); + memcpy_to_folio(folio, 0, link, len + 1); +out: + folio_end_read(folio, status == 0); brelse(bh); - return 0; + return status; } const struct address_space_operations ocfs2_fast_symlink_aops = { diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 53a945da873b..d53a6cc866be 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -127,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, char namebuf[40]; struct inode *inode = NULL; u64 blkno; - int status = 0; + int len, status = 0; - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, slot); + len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, len, &blkno); if (status < 0) { goto bail; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7..dc1761e84814 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -87,23 +87,19 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -const struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler * const ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &posix_acl_access_xattr_handler, - 
&posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL }; -static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { - [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &posix_acl_access_xattr_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &posix_acl_default_xattr_handler, - [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, - [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, +static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; struct ocfs2_xattr_info { @@ -652,7 +648,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, * 256(name) + 80(value) + 16(entry) = 352 bytes, * The max space of acl xattr taken inline is * 80(value) + 16(entry) * 2(if directory) = 192 bytes, - * when blocksize = 512, may reserve one more cluser for + * when blocksize = 512, may reserve one more cluster for * xattr bucket, otherwise reserve one metadata block * for them is ok. 
* If this is a new directory with inline data, @@ -1066,13 +1062,13 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, return i_ret + b_ret; } -static int ocfs2_xattr_find_entry(int name_index, +static int ocfs2_xattr_find_entry(struct inode *inode, int name_index, const char *name, struct ocfs2_xattr_search *xs) { struct ocfs2_xattr_entry *entry; size_t name_len; - int i, cmp = 1; + int i, name_offset, cmp = 1; if (name == NULL) return -EINVAL; @@ -1080,13 +1076,22 @@ static int ocfs2_xattr_find_entry(int name_index, name_len = strlen(name); entry = xs->here; for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + if ((void *)entry >= xs->end) { + ocfs2_error(inode->i_sb, "corrupted xattr entries"); + return -EFSCORRUPTED; + } cmp = name_index - ocfs2_xattr_get_type(entry); if (!cmp) cmp = name_len - entry->xe_name_len; - if (!cmp) - cmp = memcmp(name, (xs->base + - le16_to_cpu(entry->xe_name_offset)), - name_len); + if (!cmp) { + name_offset = le16_to_cpu(entry->xe_name_offset); + if ((xs->base + name_offset + name_len) > xs->end) { + ocfs2_error(inode->i_sb, + "corrupted xattr entries"); + return -EFSCORRUPTED; + } + cmp = memcmp(name, (xs->base + name_offset), name_len); + } if (cmp == 0) break; entry += 1; @@ -1170,7 +1175,7 @@ static int ocfs2_xattr_ibody_get(struct inode *inode, xs->base = (void *)xs->header; xs->here = xs->header->xh_entries; - ret = ocfs2_xattr_find_entry(name_index, name, xs); + ret = ocfs2_xattr_find_entry(inode, name_index, name, xs); if (ret) return ret; size = le64_to_cpu(xs->here->xe_value_size); @@ -2031,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, rc = 0; ocfs2_xa_cleanup_value_truncate(loc, "removing", orig_clusters); - if (rc) - goto out; + goto out; } } @@ -2702,7 +2706,7 @@ static int ocfs2_xattr_ibody_find(struct inode *inode, /* Find the named attribute. 
*/ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { - ret = ocfs2_xattr_find_entry(name_index, name, xs); + ret = ocfs2_xattr_find_entry(inode, name_index, name, xs); if (ret && ret != -ENODATA) return ret; xs->not_found = ret; @@ -2837,7 +2841,7 @@ static int ocfs2_xattr_block_find(struct inode *inode, xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; xs->here = xs->header->xh_entries; - ret = ocfs2_xattr_find_entry(name_index, name, xs); + ret = ocfs2_xattr_find_entry(inode, name_index, name, xs); } else ret = ocfs2_xattr_index_block_find(inode, blk_bh, name_index, @@ -2904,7 +2908,7 @@ static int ocfs2_create_xattr_block(struct inode *inode, /* Initialize ocfs2_xattr_block */ xblk = (struct ocfs2_xattr_block *)new_bh->b_data; memset(xblk, 0, inode->i_sb->s_blocksize); - strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + strscpy(xblk->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE); xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -3425,9 +3429,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); } out: @@ -4162,15 +4166,6 @@ static int cmp_xe(const void *a, const void *b) return 0; } -static void swap_xe(void *a, void *b, int size) -{ - struct ocfs2_xattr_entry *l = a, *r = b, tmp; - - tmp = *l; - memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); - memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); -} - /* * When the ocfs2_xattr_block is filled up, new bucket will be created * and all the xattr entries will be moved to the new bucket. 
@@ -4236,7 +4231,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change); sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), - cmp_xe, swap_xe); + cmp_xe, NULL); } /* @@ -4376,7 +4371,7 @@ static int cmp_xe_offset(const void *a, const void *b) /* * defrag a xattr bucket if we find that the bucket has some - * holes beteen name/value pairs. + * holes between name/value pairs. * We will move all the name/value pairs to the end of the bucket * so that we can spare some space for insertion. */ @@ -4431,7 +4426,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, */ sort(entries, le16_to_cpu(xh->xh_count), sizeof(struct ocfs2_xattr_entry), - cmp_xe_offset, swap_xe); + cmp_xe_offset, NULL); /* Move all name/values to the end of the bucket. */ xe = xh->xh_entries; @@ -4473,7 +4468,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, /* sort the entries by their name_hash. */ sort(entries, le16_to_cpu(xh->xh_count), sizeof(struct ocfs2_xattr_entry), - cmp_xe, swap_xe); + cmp_xe, NULL); buf = bucket_buf; for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) @@ -5016,7 +5011,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, * 2. If cluster_size == bucket_size: * a) If the previous extent rec has more than one cluster and the insert * place isn't in the last cluster, copy the entire last cluster to the - * new one. This time, we don't need to upate the first_bh and header_bh + * new one. This time, we don't need to update the first_bh and header_bh * since they will not be moved into the new cluster. * b) Otherwise, move the bottom half of the xattrs in the last cluster into * the new one. And we set the extend flag to zero if the insert place is @@ -6194,7 +6189,7 @@ struct ocfs2_xattr_reflink { /* * Given a xattr header and xe offset, * return the proper xv and the corresponding bh. 
- * xattr in inode, block and xattr tree have different implementaions. + * xattr in inode, block and xattr tree have different implementations. */ typedef int (get_xattr_value_root)(struct super_block *sb, struct buffer_head *bh, @@ -6274,7 +6269,7 @@ static int ocfs2_get_xattr_value_root(struct super_block *sb, } /* - * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * Lock the meta_ac and calculate how much credits we need for reflink xattrs. * It is only used for inline xattr and xattr block. */ static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, @@ -6356,7 +6351,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); - last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)] - 1; for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { xe = &xh->xh_entries[i]; @@ -6515,16 +6510,7 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) } new_oi = OCFS2_I(args->new_inode); - /* - * Adjust extent record count to reserve space for extended attribute. - * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). 
- */ - if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && - !(ocfs2_inode_is_fast_symlink(args->new_inode))) { - struct ocfs2_extent_list *el = &new_di->id2.i_list; - le16_add_cpu(&el->l_count, -(inline_size / - sizeof(struct ocfs2_extent_rec))); - } + spin_lock(&new_oi->ip_lock); new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); @@ -7247,7 +7233,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7259,9 +7245,21 @@ static int ocfs2_xattr_security_set(const struct xattr_handler *handler, static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { + struct ocfs2_security_xattr_info *si = fs_info; const struct xattr *xattr; int err = 0; + if (si) { + si->value = kmemdup(xattr_array->value, xattr_array->value_len, + GFP_KERNEL); + if (!si->value) + return -ENOMEM; + + si->name = xattr_array->name; + si->value_len = xattr_array->value_len; + return 0; + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, xattr->name, xattr->value, @@ -7277,13 +7275,23 @@ int ocfs2_init_security_get(struct inode *inode, const struct qstr *qstr, struct ocfs2_security_xattr_info *si) { + int ret; + /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - if (si) - return security_old_inode_init_security(inode, dir, qstr, - &si->name, &si->value, - &si->value_len); + if (si) { + ret = security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, si); + /* + * security_inode_init_security() does not return -EOPNOTSUPP, + * we have to check the xattr ourselves. 
+ */ + if (!ret && !si->name) + si->enable = 0; + + return ret; + } return security_inode_init_security(inode, dir, qstr, &ocfs2_initxattrs, NULL); @@ -7320,7 +7328,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7351,7 +7359,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f..65e9aa743919 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler * const ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, |
